libc/misc/wchar/wchar.c

   1
   2 /*  Copyright (C) 2002, 2003, 2004     Manuel Novoa III
   3  *
   4  *  This library is free software; you can redistribute it and/or
   5  *  modify it under the terms of the GNU Library General Public
   6  *  License as published by the Free Software Foundation; either
   7  *  version 2 of the License, or (at your option) any later version.
   8  *
   9  *  This library is distributed in the hope that it will be useful,
  10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  *  Library General Public License for more details.
  13  *
  14  *  You should have received a copy of the GNU Library General Public
  15  *  License along with this library; if not, write to the Free
  16  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17  */
  18
  19 /*  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!
  20  *
  21  *  Besides uClibc, I'm using this code in my libc for elks, which is
  22  *  a 16-bit environment with a fairly limited compiler.  It would make
  23  *  things much easier for me if this file isn't modified unnecessarily.
  24  *  In particular, please put any new or replacement functions somewhere
  25  *  else, and modify the makefile to use your version instead.
  26  *  Thanks.  Manuel
  27  *
  28  *  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION! */
  29
  30
  31 /* May 23, 2002     Initial Notes:
  32  *
  33  * I'm still tweaking this stuff, but it passes the tests I've thrown
  34  * at it, and Erik needs it for the gcc port.  The glibc extension
  35  * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
  36  * in the glibc source.  I also need to fix the behavior of
  37  * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
  38  *
  39  * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
  40  * file on my platform (x86) show about 5-10% faster conversion speed than
  41  * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
  42  * individual mbrtowc()/wcrtomb() calls.
  43  *
  44  * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
  45  * as a fail-safe UTF-8 decoder appropriate for a terminal, etc.  which
  46  * needs to deal gracefully with whatever is sent to it.  In that mode,
  47  * it passes Markus Kuhn's UTF-8-test.txt stress test.  I plan to add
  48  * an arg to force that behavior, so the interface will be changing.
  49  *
  50  * I need to fix the error checking for 16-bit wide chars.  This isn't
  51  * an issue for uClibc, but may be for ELKS.  I'm currently not sure
   52  * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
  53  *
  54  * July 1, 2002
  55  *
  56  * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
  57  * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
  58  *    locales.
  59  * Enabled building of a C/POSIX-locale-only version, so full locale support
  60  *    no longer needs to be enabled.
  61  *
  62  * Nov 4, 2002
  63  *
  64  * Fixed a bug in _wchar_wcsntoutf8s().  Don't store wcs position if dst is NULL.
  65  * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
  66  *   order to support %ls in printf.  See comments below for details.
  67  * Change behaviour of wc<->mb functions when in the C locale.  Now they do
  68  *   a 1-1 map for the range 0x80-UCHAR_MAX.  This is for backwards compatibility
   69  *   and consistency with the standard's requirement that a printf format string be
   70  *   a valid multibyte string beginning and ending in its initial shift state.
  71  *
  72  * Nov 5, 2002
  73  *
  74  * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
  75  *
  76  * Nov 7, 2002
  77  *
  78  * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
  79  *   Added some size/speed optimizations and integrated it into my locale
  80  *   framework.  Minimally tested at the moment, but the stub C-locale
  81  *   version (which most people would probably be using) should be fine.
  82  *
  83  * Nov 21, 2002
  84  *
  85  * Revert the wc<->mb changes from earlier this month involving the C-locale.
  86  * Add a couple of ugly hacks to support *wprintf.
  87  * Add a mini iconv() and iconv implementation (requires locale support).
  88  *
  89  * Aug 1, 2003
  90  * Bug fix for mbrtowc.
  91  *
  92  * Aug 18, 2003
  93  * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
  94  *
  95  * Feb 11, 2004
  96  * Bug fix: Fix size check for remaining output space in iconv().
  97  *
  98  * Manuel
  99  */
 100
 101 #include <errno.h>
 102 #include <stddef.h>
 103 #include <limits.h>
 104 #include <stdint.h>
 105 #include <inttypes.h>
 106 #include <stdlib.h>
 107 #include <stdio.h>
 108 #include <assert.h>
 109 #include <locale.h>
 110 #include <wchar.h>
 111 #include <bits/uClibc_uwchar.h>
 112
 113 /**********************************************************************/
 114 #ifdef __UCLIBC_HAS_LOCALE__
 115 #ifdef __UCLIBC_MJN3_ONLY__
 116 #ifdef L_iswspace
 117 /* generates one warning */
 118 #warning TODO: Fix Cc2wc* and Cwc2c* defines!
 119 #endif
 120 #endif /* __UCLIBC_MJN3_ONLY__ */
 121
 122 #define ENCODING                (__UCLIBC_CURLOCALE->encoding)
 123
 124 #define Cc2wc_IDX_SHIFT         __LOCALE_DATA_Cc2wc_IDX_SHIFT
 125 #define Cc2wc_ROW_LEN           __LOCALE_DATA_Cc2wc_ROW_LEN
 126 #define Cwc2c_DOMAIN_MAX        __LOCALE_DATA_Cwc2c_DOMAIN_MAX
 127 #define Cwc2c_TI_SHIFT          __LOCALE_DATA_Cwc2c_TI_SHIFT
 128 #define Cwc2c_TT_SHIFT          __LOCALE_DATA_Cwc2c_TT_SHIFT
 129 #define Cwc2c_TI_LEN            __LOCALE_DATA_Cwc2c_TI_LEN
 130
 131 #ifndef __CTYPE_HAS_UTF_8_LOCALES
 132 #warning __CTYPE_HAS_UTF_8_LOCALES not set!
 133 #endif
 134
 135 #else  /* __UCLIBC_HAS_LOCALE__ */
 136
 137 #ifdef __UCLIBC_MJN3_ONLY__
 138 #ifdef L_btowc
 139 /* emit only once */
 140 #warning fix preprocessor logic testing locale settings
 141 #endif
 142 #endif
 143
 144 #define ENCODING (__ctype_encoding_7_bit)
 145 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 146 #error __CTYPE_HAS_8_BIT_LOCALES is defined!
 147 #endif
 148 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 149 #error __CTYPE_HAS_UTF_8_LOCALES is defined!
 150 #endif
 151 #undef L__wchar_utf8sntowcs
 152 #undef L__wchar_wcsntoutf8s
 153
 154 #endif /* __UCLIBC_HAS_LOCALE__ */
 155 /**********************************************************************/
 156
  /* UTF_8_MAX_LEN is 6 when wchar_t can hold values above 0xffff and 3
   * otherwise (presumably the longest UTF-8 sequence, in bytes, that fits
   * a wchar_t).  The UTF-8 converters below test it with #if to pick
   * their range checks. */
  157 #if WCHAR_MAX > 0xffffUL
  158 #define UTF_8_MAX_LEN 6
  159 #else
  160 #define UTF_8_MAX_LEN 3
  161 #endif
  162
  /* With KUHN defined, the UTF-8 converters below additionally reject the
   * surrogate range 0xd800-0xdfff and the values 0xfffe and 0xffff. */
  163 #define KUHN 1
 164
  165 /* Implementation-specific work functions.  Both are declared
       * attribute_hidden and do the UTF-8 side of the conversions for the
       * public mb<->wc entry points in this file. */
  166
      /* UTF-8 -> wide: reads at most n bytes from *src, produces at most wn
       * wide chars at pwc, tracking partial sequences in *ps. */
  167 extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  168                                         const char **__restrict src, size_t n,
  169                                         mbstate_t *ps, int allow_continuation) attribute_hidden;
  170
      /* Wide -> UTF-8: reads at most wn wide chars from *src, writes at most
       * n bytes at s. */
  171 extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  172                                         const wchar_t **__restrict src, size_t wn) attribute_hidden;
 173
 174 /**********************************************************************/
 175 #ifdef L_btowc
 176
 177 /* libc_hidden_proto(mbrtowc) */
 178
 179 /* libc_hidden_proto(btowc) */
 180 wint_t btowc(int c)
 181 {
 182 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 183
 184         wchar_t wc;
 185         unsigned char buf[1];
 186         mbstate_t mbstate;
 187
 188         if (c != EOF) {
 189                 *buf = (unsigned char) c;
 190                 mbstate.__mask = 0;             /* Initialize the mbstate. */
 191                 if (mbrtowc(&wc, (char*) buf, 1, &mbstate) <= 1) {
 192                         return wc;
 193                 }
 194         }
 195         return WEOF;
 196
 197 #else  /* !__CTYPE_HAS_8_BIT_LOCALES */
 198
 199 #ifdef __UCLIBC_HAS_LOCALE__
 200         assert((ENCODING == __ctype_encoding_7_bit)
 201                    || (ENCODING == __ctype_encoding_utf8));
 202 #endif
 203
 204         /* If we don't have 8-bit locale support, then this is trivial since
 205          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
 206         return (((unsigned int)c) < 0x80) ? c : WEOF;
 207
 208 #endif /* !__CTYPE_HAS_8_BIT_LOCALES */
 209 }
 210 libc_hidden_def(btowc)
 211
 212 #endif
 213 /**********************************************************************/
 214 #ifdef L_wctob
 215
 216 /* Note: We completely ignore ps in all currently supported conversions. */
 217
 218 /* libc_hidden_proto(wcrtomb) */
 219
 220 int wctob(wint_t c)
 221 {
 222 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 223
 224         unsigned char buf[MB_LEN_MAX];
 225
 226         return (wcrtomb((char*) buf, c, NULL) == 1) ? *buf : EOF;
 227
 228 #else  /*  __CTYPE_HAS_8_BIT_LOCALES */
 229
 230 #ifdef __UCLIBC_HAS_LOCALE__
 231         assert((ENCODING == __ctype_encoding_7_bit)
 232                    || (ENCODING == __ctype_encoding_utf8));
 233 #endif /* __UCLIBC_HAS_LOCALE__ */
 234
 235         /* If we don't have 8-bit locale support, then this is trivial since
 236          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
 237
 238         /* TODO: need unsigned version of wint_t... */
 239 /*      return (((unsigned int)c) < 0x80) ? c : WEOF; */
 240         return ((c >= 0) && (c < 0x80)) ? c : EOF;
 241
 242 #endif /*  __CTYPE_HAS_8_BIT_LOCALES */
 243 }
 244
 245 #endif
 246 /**********************************************************************/
 247 #ifdef L_mbsinit
 248
 249 /* libc_hidden_proto(mbsinit) */
 250 int mbsinit(const mbstate_t *ps)
 251 {
 252         return !ps || !ps->__mask;
 253 }
 254 libc_hidden_def(mbsinit)
 255
 256 #endif
 257 /**********************************************************************/
 258 #ifdef L_mbrlen
 259
 260 /* libc_hidden_proto(mbrtowc) */
 261
 262 /* libc_hidden_proto(mbrlen) */
 263 size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
 264 {
 265         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 266
 267         return mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate);
 268 }
 269 libc_hidden_def(mbrlen)
 270
 271 #endif
 272 /**********************************************************************/
 273 #ifdef L_mbrtowc
 274
 275 /* libc_hidden_proto(mbsnrtowcs) */
 276
 277 /* libc_hidden_proto(mbrtowc) */
 278 size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
 279                            size_t n, mbstate_t *__restrict ps)
 280 {
 281         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 282         wchar_t wcbuf[1];
 283         const char *p;
 284         size_t r;
 285         char empty_string[1];           /* Avoid static to be fPIC friendly. */
 286
 287         if (!ps) {
 288                 ps = &mbstate;
 289         }
 290
 291         if (!s) {
 292                 pwc = (wchar_t *) s;    /* NULL */
 293                 empty_string[0] = 0;    /* Init the empty string when necessary. */
 294                 s = empty_string;
 295                 n = 1;
 296         } else if (*s == '\0') {
 297     /* According to the ISO C 89 standard this is the expected behaviour.  */
 298                 return 0;
 299         } else if (!n) {
 300                 /* TODO: change error code? */
 301 #if 0
 302                 return (ps->__mask && (ps->__wc == 0xffffU))
 303                         ? ((size_t) -1) : ((size_t) -2);
 304 #else
 305                 return 0;
 306 #endif
 307         }
 308
 309         p = s;
 310
 311 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 312         /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
 313         if (ENCODING == __ctype_encoding_utf8) {
 314                 if (!pwc) {
 315                         pwc = wcbuf;
 316                 }
 317                 r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
 318                 return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
 319         }
 320 #endif
 321
 322 #ifdef __UCLIBC_MJN3_ONLY__
 323 #warning TODO: This adds a trailing nul!
 324 #endif /* __UCLIBC_MJN3_ONLY__ */
 325
 326         r = mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
 327
 328         if (((ssize_t) r) >= 0) {
 329                 if (pwc) {
 330                         *pwc = *wcbuf;
 331                 }
 332         }
 333         return (size_t) r;
 334 }
 335 libc_hidden_def(mbrtowc)
 336
 337 #endif
 338 /**********************************************************************/
 339 #ifdef L_wcrtomb
 340
 341 /* libc_hidden_proto(wcsnrtombs) */
 342
 343 /* Note: We completely ignore ps in all currently supported conversions. */
 344 /* TODO: Check for valid state anyway? */
 345
 346 /* libc_hidden_proto(wcrtomb) */
 347 size_t wcrtomb(register char *__restrict s, wchar_t wc,
 348                            mbstate_t *__restrict ps)
 349 {
 350 #ifdef __UCLIBC_MJN3_ONLY__
 351 #warning TODO: Should wcsnrtombs nul-terminate unconditionally?  Check glibc.
 352 #endif /* __UCLIBC_MJN3_ONLY__ */
 353         wchar_t wcbuf[1];
 354         const wchar_t *pwc;
 355         size_t r;
 356         char buf[MB_LEN_MAX];
 357
 358         if (!s) {
 359                 s = buf;
 360                 wc = 0;
 361         }
 362
 363         pwc = wcbuf;
 364         wcbuf[0] = wc;
 365
 366         r = wcsnrtombs(s, &pwc, 1, MB_LEN_MAX, ps);
 367         return (r != 0) ? r : 1;
 368 }
 369 libc_hidden_def(wcrtomb)
 370
 371 #endif
 372 /**********************************************************************/
 373 #ifdef L_mbsrtowcs
 374
 375 /* libc_hidden_proto(mbsnrtowcs) */
 376
 377 /* libc_hidden_proto(mbsrtowcs) */
 378 size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
 379                                  size_t len, mbstate_t *__restrict ps)
 380 {
 381         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 382
 383         return mbsnrtowcs(dst, src, SIZE_MAX, len,
 384                                                 ((ps != NULL) ? ps : &mbstate));
 385 }
 386 libc_hidden_def(mbsrtowcs)
 387
 388 #endif
 389 /**********************************************************************/
 390 #ifdef L_wcsrtombs
 391
 392 /* Note: We completely ignore ps in all currently supported conversions.
 393
 394  * TODO: Check for valid state anyway? */
 395
 396 /* libc_hidden_proto(wcsnrtombs) */
 397
 398 /* libc_hidden_proto(wcsrtombs) */
 399 size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
 400                                  size_t len, mbstate_t *__restrict ps)
 401 {
 402         return wcsnrtombs(dst, src, SIZE_MAX, len, ps);
 403 }
 404 libc_hidden_def(wcsrtombs)
 405
 406 #endif
 407 /**********************************************************************/
 408 #ifdef L__wchar_utf8sntowcs
 409
 410 /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
  411  * UTF-8-test.txt stress test.
 412  */
 413 /*  #define DECODER */
 414
 415 #ifdef DECODER
 416 #ifndef KUHN
 417 #define KUHN
 418 #endif
 419 #endif
 420
  /* _wchar_utf8sntowcs(): UTF-8 to wide-character conversion worker.
   *
   * Reads at most n bytes starting at *src and produces at most wn wide
   * characters.
   *
   *   pwc  destination.  NULL means "count only": nothing is kept, wn is
   *        treated as SIZE_MAX and *src is left alone.  Passing
   *        (wchar_t *) ps also means "count only" but keeps the caller's
   *        wn (see the hack note in the body).
   *   src  address of the input pointer (*src must not be NULL).
   *   ps   conversion state (must not be NULL).  A non-zero __mask means a
   *        sequence is pending; unless DECODER is defined, __wc == 0xffff
   *        then marks the error state.
   *   allow_continuation  when non-zero, an incomplete trailing sequence
   *        is saved in *ps, *src is advanced past it and (size_t) -2 is
   *        returned.
   *
   * Returns 0 at once when wn is 0.  Otherwise returns the number of wide
   * characters produced (wn - count).  Unless DECODER is defined,
   * conversion stops after a nul, which is stored but not counted (*src
   * is then set to NULL when storing), and an invalid sequence puts *ps
   * in the error state, sets errno to EILSEQ and returns (size_t) -1.
   * With DECODER defined, invalid input yields 0xfffd instead.
   */
  421 size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  422                                                   const char **__restrict src, size_t n,
  423                                                   mbstate_t *ps, int allow_continuation)
  424 {
  425         register const char *s;
  426         __uwchar_t mask;
  427         __uwchar_t wc;
  428         wchar_t wcbuf[1];
  429         size_t count;
  430         int incr;
  431
  432         s = *src;
  433
  434         assert(s != NULL);
  435         assert(ps != NULL);
  436
  437         incr = 1;
  438         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
  439          * wprintf, we need to be able to compute the number of wchars needed
  440          * for the mbs conversion, not to exceed the precision specified.
  441          * But if dst is NULL, the return value is the length assuming a
  442          * sufficiently sized buffer.  So, we allow passing of (wchar_t *) ps
  443          * as pwc in order to flag that we really want the length, subject
  444          * to the restricted buffer size and no partial conversions.
  445          * See mbsnrtowcs() as well. */
  446         if (!pwc || (pwc == ((wchar_t *)ps))) {
  447                 if (!pwc) {
  448                         wn = SIZE_MAX;
  449                 }
              /* Count-only mode: every result lands in the one-element
               * scratch buffer, and incr == 0 keeps pwc parked there. */
  450                 pwc = wcbuf;
  451                 incr = 0;
  452         }
  453
  454         /* This is really here only to support the glibc extension function
  455          * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
  456          * check on the validity of the mbstate. */
  457         if (!(count = wn)) {
  458                 return 0;
  459         }
  460
  461         if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
  462 #ifdef DECODER
  463                 wc = (__uwchar_t) ps->__wc;
  464                 if (n) {
  465                         goto CONTINUE;
  466                 }
  467                 goto DONE;
  468 #else
  469                 if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
  470                         /* TODO: change error code here and below? */
  471                         if (n) {
  472                                 goto CONTINUE;
  473                         }
  474                         goto DONE;
  475                 }
  476                 __set_errno(EILSEQ);
  477                 return (size_t) -1;             /* We're in an error state. */
  478 #endif
  479         }
  480
  481         do {
  482                 if (!n) {
  483                         goto DONE;
  484                 }
  485                 --n;
  486                 if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
  487                         mask = 0x40;
  488 #ifdef __UCLIBC_MJN3_ONLY__
  489 #warning TODO: Fix range for 16 bit wchar_t case.
  490 #endif
                      /* Only bytes 0xc2..0xfd are accepted as the start of
                       * a multibyte sequence. */
  491                         if (( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) &&
  492                         (((unsigned char)s[-1] != 0xc0 ) && ((unsigned char)s[-1] != 0xc1 ))) {
  493                                 goto START;
  494                         }
  495                 BAD:
  496 #ifdef DECODER
  497                         wc = 0xfffdU;
  498                         goto COMPLETE;
  499 #else
  500                         ps->__mask = mask;
  501                         ps->__wc = 0xffffU;
  502                         __set_errno(EILSEQ);
  503                         return (size_t) -1;     /* Illegal start byte! */
  504 #endif
  505
  506                 CONTINUE:
  507                         while (n) {
  508                                 --n;
  509                                 if ((*s & 0xc0) != 0x80) {
  510                                         goto BAD;
  511                                 }
  512                                 mask <<= 5;
  513                                 wc <<= 6;
  514                                 wc += (*s & 0x3f);      /* keep separate for bcc (smaller code) */
  515                                 ++s;
  516                         START:
                              /* Strip the length-marker bit that sits just
                               * above mask; the bit at mask tells whether
                               * more continuation bytes are expected. */
  517                                 wc &= ~(mask << 1);
  518
  519                                 if ((wc & mask) == 0) { /* Character completed. */
                                      /* After this adjustment mask is the
                                       * smallest value that needs a sequence
                                       * of the length just read. */
  520                                         if ((mask >>= 5) == 0x40) {
  521                                                 mask += mask;
  522                                         }
  523                                         /* Check for invalid sequences (longer than necessary)
  524                                          * and invalid chars.  */
  525                                         if ( (wc < mask) /* Sequence not minimal length. */
  526 #ifdef KUHN
  527 #if UTF_8_MAX_LEN == 3
  528 #error broken since mask can overflow!!
  529                                                  /* For plane 0, these are the only defined values.*/
  530                                                  || (wc > 0xfffdU)
  531 #else
  532                                                  /* Note that we don't need to worry about exceeding */
  533                                                  /* 31 bits as that is the most that UTF-8 provides. */
  534                                                  || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  535 #endif
  536                                                  || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  537 #endif /* KUHN */
  538                                                  ) {
  539                                                 goto BAD;
  540                                         }
  541                                         goto COMPLETE;
  542                                 }
  543                         }
  544                         /* Character potentially valid but incomplete. */
  545                         if (!allow_continuation) {
                              /* If earlier characters were already converted
                               * in this call, give up with 0 and leave *src
                               * and *ps untouched. */
  546                                 if (count != wn) {
  547                                         return 0;
  548                                 }
  549                                 /* NOTE: The following can fail if you allow and then disallow
  550                                  * continuation!!! */
  551 #if UTF_8_MAX_LEN == 3
  552 #error broken since mask can overflow!!
  553 #endif
  554                                 /* Need to back up... */
  555                                 do {
  556                                         --s;
  557                                 } while ((mask >>= 5) >= 0x40);
  558                                 goto DONE;
  559                         }
  560                         ps->__mask = (wchar_t) mask;
  561                         ps->__wc = (wchar_t) wc;
  562                         *src = s;
  563                         return (size_t) -2;
  564                 }
  565         COMPLETE:
  566                 *pwc = wc;
  567                 pwc += incr;
  568         }
  569 #ifdef DECODER
  570         while (--count);
  571 #else
      /* A nul ends the loop before count is decremented, so the
       * terminator is stored but not included in the return value. */
  572         while (wc && --count);
  573
  574         if (!wc) {
  575                 s = NULL;
  576         }
  577 #endif
  578
  579  DONE:
  580         /* ps->__wc is irrelevant here. */
  581         ps->__mask = 0;
      /* In count-only mode pwc still equals wcbuf, and the caller's
       * input pointer is deliberately left unchanged. */
  582         if (pwc != wcbuf) {
  583                 *src = s;
  584         }
  585
  586         return wn - count;
  587 }
 588
 589 #endif
 590 /**********************************************************************/
 591 #ifdef L__wchar_wcsntoutf8s
 592
  /* _wchar_wcsntoutf8s(): wide-character to UTF-8 conversion worker.
   *
   * Encodes at most wn wide characters from *src, writing at most n bytes.
   *
   *   s    destination.  NULL means "count only": n is treated as SIZE_MAX,
   *        output goes to a scratch buffer and *src is left alone.  Passing
   *        (char *) src also means "count only" but keeps the caller's n
   *        (see the hack note in the body).
   *   src  address of the input pointer (*src must not be NULL).
   *
   * Returns the number of bytes produced (n - t).  A nul wide character
   * ends the conversion: its nul byte is written but not counted, and
   * *src is set to NULL when storing.  A character whose encoding does not
   * fit in the remaining space ends the conversion without being consumed
   * (only the first output byte has been scribbled on).  A value that may
   * not be encoded sets errno to EILSEQ and returns (size_t) -1 without
   * updating *src.
   */
  593 size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  594                                                   const wchar_t **__restrict src, size_t wn)
  595 {
  596         register char *p;
  597         size_t len, t;
  598         __uwchar_t wc;
  599         const __uwchar_t *swc;
  600         int store;
  601         char buf[MB_LEN_MAX];
  602         char m;
  603
  604         store = 1;
  605         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
  606          * printf, we need to be able to compute the number of bytes needed
  607          * for the mbs conversion, not to exceed the precision specified.
  608          * But if dst is NULL, the return value is the length assuming a
  609          * sufficiently sized buffer.  So, we allow passing of (char *) src
  610          * as dst in order to flag that we really want the length, subject
  611          * to the restricted buffer size and no partial conversions.
  612          * See wcsnrtombs() as well. */
  613         if (!s || (s == ((char *) src))) {
  614                 if (!s) {
  615                         n = SIZE_MAX;
  616                 }
              /* Count-only mode: each character is encoded into buf and
               * s never advances (store == 0). */
  617             s = buf;
  618                 store = 0;
  619         }
  620
      /* t is the output space still available. */
  621         t = n;
  622         swc = (const __uwchar_t *) *src;
  623
  624         assert(swc != NULL);
  625
  626         while (wn && t) {
  627                 wc = *swc;
  628
              /* Assume a one-byte result; corrected below for wc >= 0x80. */
  629                 *s = wc;
  630                 len = 1;
  631
  632                 if (wc >= 0x80) {
  633 #ifdef KUHN
  634                         if (
  635 #if UTF_8_MAX_LEN == 3
  636                                 /* For plane 0, these are the only defined values.*/
  637                                 /* Note that we don't need to worry about exceeding */
  638                                 /* 31 bits as that is the most that UTF-8 provides. */
  639                                 (wc > 0xfffdU)
  640 #else
  641                                 /* UTF_8_MAX_LEN == 6 */
  642                                 (wc > 0x7fffffffUL)
  643                                 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  644 #endif
  645                                 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  646                                 ) {
  647                                 __set_errno(EILSEQ);
  648                                 return (size_t) -1;
  649                         }
  650 #else  /* KUHN */
  651 #if UTF_8_MAX_LEN != 3
  652                         if (wc > 0x7fffffffUL) { /* Value too large. */
  653                                 __set_errno(EILSEQ);
  654                                 return (size_t) -1;
  655                         }
  656 #endif
  657 #endif /* KUHN */
  658
                      /* Work out the encoded length: p ends up len bytes
                       * past s.  wc is consumed here and reloaded after. */
  659                         wc >>= 1;
  660                         p = s;
  661                         do {
  662                                 ++p;
  663                         } while (wc >>= 5);
  664                         wc = *swc;
  665                         if ((len = p - s) > t) { /* Not enough space. */
  666                                 break;
  667                         }
  668
                      /* Emit 6 payload bits per byte from the last byte
                       * back to the first, growing the lead-byte marker in
                       * m by one bit per byte. */
  669                         m = 0x80;
  670                         while( p>s ) {
  671                                 m = (m >> 1) | 0x80;
  672                                 *--p = (wc & 0x3f) | 0x80;
  673                                 wc >>= 6;
  674                         }
  675                         *s |= (m << 1);
  676                 } else if (wc == 0) {   /* End of string. */
  677                         swc = NULL;
  678                         break;
  679                 }
  680
  681                 ++swc;
  682                 --wn;
  683                 t -= len;
  684                 if (store) {
  685                         s += len;
  686                 }
  687         }
  688
  689         if (store) {
  690                 *src = (const wchar_t *) swc;
  691         }
  692
  693         return n - t;
  694 }
 695
 696
 697 #endif
 698 /**********************************************************************/
 699 #ifdef L_mbsnrtowcs
 700
 701 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
 702
 703 /* libc_hidden_proto(mbsnrtowcs) */
  /* mbsnrtowcs(): convert at most NMC bytes of a multibyte string into at
   * most len wide characters.
   *
   *   dst  destination.  NULL means "count only" with len treated as
   *        SIZE_MAX.  Passing (wchar_t *) ps also means "count only" but
   *        keeps the caller's len (see the hack note in the body).
   *   src  address of the input pointer; it is updated only when results
   *        are actually being stored.
   *   NMC  maximum number of input bytes to examine.
   *   len  maximum number of wide characters to produce.
   *   ps   conversion state; NULL selects a state private to mbsnrtowcs().
   *
   * In a UTF-8 locale the work is handed to _wchar_utf8sntowcs() with
   * continuation allowed, and its (size_t) -2 result is reported as 0.
   * The remaining encodings are single-byte, so len is clamped to NMC.
   * Conversion stops at a nul, which is stored but not counted, and *src
   * is then set to NULL.  A byte that has no wide-character mapping sets
   * errno to EILSEQ and returns (size_t) -1.  Otherwise the number of wide
   * characters produced is returned.
   */
  704 size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
  705                                         size_t NMC, size_t len, mbstate_t *__restrict ps)
  706 {
  707         static mbstate_t mbstate;       /* Rely on bss 0-init. */
  708         wchar_t wcbuf[1];
  709         const char *s;
  710         size_t count;
  711         int incr;
  712
  713         if (!ps) {
  714                 ps = &mbstate;
  715         }
  716
  717 #ifdef __CTYPE_HAS_UTF_8_LOCALES
  718         if (ENCODING == __ctype_encoding_utf8) {
  719                 size_t r;
  720                 return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
  721                                 != (size_t) -2) ? r : 0;
  722         }
  723 #endif
  724         incr = 1;
  725         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
  726          * wprintf, we need to be able to compute the number of wchars needed
  727          * for the mbs conversion, not to exceed the precision specified.
  728          * But if dst is NULL, the return value is the length assuming a
  729          * sufficiently sized buffer.  So, we allow passing of ((wchar_t *)ps)
  730          * as dst in order to flag that we really want the length, subject
  731          * to the restricted buffer size and no partial conversions.
  732          * See _wchar_utf8sntowcs() as well. */
  733         if (!dst || (dst == ((wchar_t *)ps))) {
  734                 if (!dst) {
  735                         len = SIZE_MAX;
  736                 }
              /* Count-only mode: results go to the one-element scratch
               * buffer and incr == 0 keeps dst parked there. */
  737                 dst = wcbuf;
  738                 incr = 0;
  739         }
  740
  741         /* Since all the following encodings are single-byte encodings... */
  742         if (len > NMC) {
  743                 len = NMC;
  744         }
  745
  746         count = len;
  747         s = *src;
  748
  749 #ifdef __CTYPE_HAS_8_BIT_LOCALES
  750         if (ENCODING == __ctype_encoding_8_bit) {
  751                 wchar_t wc;
  752                 while (count) {
  753                         if ((wc = ((unsigned char)(*s))) >= 0x80) {     /* Non-ASCII... */
                              /* Two-level lookup in the current locale's
                               * byte-to-wide tables; a zero entry means the
                               * byte has no mapping. */
  754                                 wc -= 0x80;
  755                                 wc = __UCLIBC_CURLOCALE->tbl8c2wc[
  756                                                   (__UCLIBC_CURLOCALE->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
  757                                                    << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
  758                                 if (!wc) {
  759                                         goto BAD;
  760                                 }
  761                         }
  762                         if (!(*dst = wc)) {
  763                                 s = NULL;
  764                                 break;
  765                         }
  766                         dst += incr;
  767                         ++s;
  768                         --count;
  769                 }
  770                 if (dst != wcbuf) {
  771                         *src = s;
  772                 }
  773                 return len - count;
  774         }
  775 #endif
  776
  777 #ifdef __UCLIBC_HAS_LOCALE__
  778         assert(ENCODING == __ctype_encoding_7_bit);
  779 #endif
  780
      /* 7-bit encoding: bytes map to themselves and anything >= 0x80 is
       * rejected. */
  781         while (count) {
  782                 if ((*dst = (unsigned char) *s) == 0) {
  783                         s = NULL;
  784                         break;
  785                 }
  786                 if (*dst >= 0x80) {
  787 #ifdef __CTYPE_HAS_8_BIT_LOCALES
                  /* Also the target of the goto in the 8-bit loop above. */
  788                 BAD:
  789 #endif
  790                         __set_errno(EILSEQ);
  791                         return (size_t) -1;
  792                 }
  793                 ++s;
  794                 dst += incr;
  795                 --count;
  796         }
  797         if (dst != wcbuf) {
  798                 *src = s;
  799         }
  800         return len - count;
  801 }
 802 libc_hidden_def(mbsnrtowcs)
 803
 804 #endif
 805 /**********************************************************************/
 806 #ifdef L_wcsnrtombs
 807
 808 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
 809
 810 /* Note: We completely ignore ps in all currently supported conversions.
 811  * TODO: Check for valid state anyway? */
 812
 813 /* libc_hidden_proto(wcsnrtombs) */
     /*
      * wcsnrtombs - convert wide characters to bytes in the current encoding.
      *
      * dst  destination buffer.  NULL, or the value (char *) src, selects a
      *      "length only" mode (see the AWFUL HACK note in the body).
      * src  address of the pointer to the wide string.  It is written back
      *      only when bytes were really stored through dst.
      * NWC  maximum number of wide characters to read.
      * len  maximum number of bytes to produce (forced to SIZE_MAX when dst
      *      is NULL).
      * ps   not referenced by the code in this function.
      *
      * For a UTF-8 encoding the call is handed to _wchar_wcsntoutf8s().  The
      * remaining encodings are single-byte, so at most min(len, NWC)
      * characters are processed.
      *
      * Returns the number of bytes produced, not counting a terminating nul,
      * or (size_t) -1 with errno set to EILSEQ when a wide character cannot
      * be converted.  When a nul wide character is reached, the pointer that
      * is written back to *src is NULL.
      */
 814 size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
 815                                         size_t NWC, size_t len, mbstate_t *__restrict ps)
 816 {
 817         const __uwchar_t *s;
 818         size_t count;
 819         int incr;
 820         char buf[MB_LEN_MAX];
 821
 822 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 823         if (ENCODING == __ctype_encoding_utf8) {
 824                 return _wchar_wcsntoutf8s(dst, len, src, NWC);
 825         }
 826 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
 827
 828         incr = 1;
 829         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
 830          * printf, we need to be able to compute the number of bytes needed
 831          * for the mbs conversion, not to exceed the precision specified.
 832          * But if dst is NULL, the return value is the length assuming a
 833          * sufficiently sized buffer.  So, we allow passing of (char *) src
 834          * as dst in order to flag that we really want the length, subject
 835          * to the restricted buffer size and no partial conversions.
 836          * See _wchar_wcsntoutf8s() as well. */
 837         if (!dst || (dst == ((char *) src))) {
 838                 if (!dst) {
 839                         len = SIZE_MAX;
 840                 }
     /* Length-only mode: every byte lands in the same scratch slot
      * (incr == 0), so dst stays equal to buf and the "dst != buf" tests
      * below leave *src untouched. */
 841                 dst = buf;
 842                 incr = 0;
 843         }
 844
 845         /* Since all the following encodings are single-byte encodings... */
 846         if (len > NWC) {
 847                 len = NWC;
 848         }
 849
 850         count = len;
 851         s = (const __uwchar_t *) *src;
 852
 853 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 854         if (ENCODING == __ctype_encoding_8_bit) {
 855                 __uwchar_t wc;
 856                 __uwchar_t u;
 857                 while (count) {
     /* Values up to 0x7f are copied unchanged; a nul ends the conversion. */
 858                         if ((wc = *s) <= 0x7f) {
 859                                 if (!(*dst = (unsigned char) wc)) {
 860                                         s = NULL;
 861                                         break;
 862                                 }
 863                         } else {
     /* Three chained lookups through the locale's idx8wc2c / tbl8wc2c
      * tables; u stays 0 when wc is above Cwc2c_DOMAIN_MAX, and a final
      * value of 0 is treated as "no mapping" below. */
 864                                 u = 0;
 865                                 if (wc <= Cwc2c_DOMAIN_MAX) {
 866                                         u = __UCLIBC_CURLOCALE->idx8wc2c[wc >> (Cwc2c_TI_SHIFT
 867                                                                                                                 + Cwc2c_TT_SHIFT)];
 868                                         u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
 869                                                                         + ((wc >> Cwc2c_TT_SHIFT)
 870                                                                            & ((1 << Cwc2c_TI_SHIFT)-1))];
 871                                         u = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
 872                                                                         + (u << Cwc2c_TT_SHIFT)
 873                                                                         + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
 874                                 }
 875
 876 #ifdef __WCHAR_REPLACEMENT_CHAR
 877                                 *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
 878 #else  /* __WCHAR_REPLACEMENT_CHAR */
     /* BAD is the EILSEQ exit located inside the 7-bit loop further down. */
 879                                 if (!u) {
 880                                         goto BAD;
 881                                 }
 882                                 *dst = (unsigned char) u;
 883 #endif /* __WCHAR_REPLACEMENT_CHAR */
 884                         }
 885                         ++s;
 886                         dst += incr;
 887                         --count;
 888                 }
 889                 if (dst != buf) {
 890                         *src = (const wchar_t *) s;
 891                 }
 892                 return len - count;
 893         }
 894 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
 895
 896 #ifdef __UCLIBC_HAS_LOCALE__
 897         assert(ENCODING == __ctype_encoding_7_bit);
 898 #endif
 899
     /* 7-bit encoding: anything >= 0x80 is an error, a nul ends the
      * conversion, everything else is copied as a single byte. */
 900         while (count) {
 901                 if (*s >= 0x80) {
 902 #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
 903                 BAD:
 904 #endif
 905                         __set_errno(EILSEQ);
 906                         return (size_t) -1;
 907                 }
 908                 if ((*dst = (unsigned char) *s) == 0) {
 909                         s = NULL;
 910                         break;
 911                 }
 912                 ++s;
 913                 dst += incr;
 914                 --count;
 915         }
 916         if (dst != buf) {
 917                 *src = (const wchar_t *) s;
 918         }
 919         return len - count;
 920 }
 921 libc_hidden_def(wcsnrtombs)
 922
 923 #endif
 924 /**********************************************************************/
 925 #ifdef L_wcswidth
 926
 927 /* libc_hidden_proto(wcswidth) */
 928
 929 #ifdef __UCLIBC_MJN3_ONLY__
 930 #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
 931 #warning TODO: Update wcwidth to match latest by Kuhn.
 932 #endif
 933
 934 #if defined(__UCLIBC_HAS_LOCALE__) && \
 935 ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
 936
     /*
      * Page index used by wcswidth() for wide characters in 0x100..0xffff.
      * It is indexed by the high byte (wc >> 8); entries [h] and [h+1] are
      * used as the lower and upper bounds of the binary search over new_tbl,
      * which is why there are 257 entries rather than 256.
      */
 937 static const unsigned char new_idx[] = {
 938         0,    5,    5,    6,   10,   15,   28,   39,
 939         48,   48,   71,   94,  113,  128,  139,  154,
 940         175,  186,  188,  188,  188,  188,  188,  188,
 941         203,  208,  208,  208,  208,  208,  208,  208,
 942         208,  219,  219,  219,  222,  222,  222,  222,
 943         222,  222,  222,  222,  222,  222,  222,  224,
 944         224,  231,  231,  231,  231,  231,  231,  231,
 945         231,  231,  231,  231,  231,  231,  231,  231,
 946         231,  231,  231,  231,  231,  231,  231,  231,
 947         231,  231,  231,  231,  231,  231,  231,  231,
 948         231,  231,  231,  231,  231,  231,  231,  231,
 949         231,  231,  231,  231,  231,  231,  231,  231,
 950         231,  231,  231,  231,  231,  231,  231,  231,
 951         231,  231,  231,  231,  231,  231,  231,  231,
 952         231,  231,  231,  231,  231,  231,  231,  231,
 953         231,  231,  231,  231,  231,  231,  231,  231,
 954         231,  231,  231,  231,  231,  231,  231,  231,
 955         231,  231,  231,  231,  231,  231,  231,  231,
 956         231,  231,  231,  231,  231,  231,  231,  231,
 957         231,  231,  231,  231,  231,  231,  231,  231,
 958         231,  231,  231,  231,  231,  233,  233,  233,
 959         233,  233,  233,  233,  234,  234,  234,  234,
 960         234,  234,  234,  234,  234,  234,  234,  234,
 961         234,  234,  234,  234,  234,  234,  234,  234,
 962         234,  234,  234,  234,  234,  234,  234,  234,
 963         234,  234,  234,  234,  234,  234,  234,  234,
 964         234,  234,  234,  234,  234,  234,  234,  234,
 965         236,  236,  236,  236,  236,  236,  236,  236,
 966         236,  236,  236,  236,  236,  236,  236,  236,
 967         236,  236,  236,  236,  236,  236,  236,  236,
 968         236,  236,  236,  236,  236,  236,  236,  236,
 969         236,  237,  237,  238,  241,  241,  242,  249,
 970         255,
 971 };
 972
     /*
      * Low-byte boundary values searched by wcswidth().  Within the bounds
      * taken from new_idx, the low byte of the wide character (wc & 0xff) is
      * compared against these entries to pick a slot; the slot number then
      * selects the width in new_wtbl.
      * NOTE(review): each page's run presumably holds ascending range
      * starts -- confirm against the table generator.
      */
 973 static const unsigned char new_tbl[] = {
 974         0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
 975         0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
 976         0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
 977         0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
 978         0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
 979         0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
 980         0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
 981         0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
 982         0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
 983         0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
 984         0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
 985         0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
 986         0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
 987         0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
 988         0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
 989         0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
 990         0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
 991         0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
 992         0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
 993         0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
 994         0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
 995         0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
 996         0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
 997         0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
 998         0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
 999         0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
1000         0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
1001         0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
1002         0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
1003         0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
1004         0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
1005         0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
1006 };
1007
     /*
      * Column widths parallel to new_tbl: wcswidth() adds new_wtbl[slot] to
      * its running total, where slot is the position found by the binary
      * search over new_tbl.  The values present are -1, 0, 1 and 2; the
      * lookup code in wcswidth() notes that no -1 entry should be reached.
      */
1008 static const signed char new_wtbl[] = {
1009         0,   -1,    1,   -1,    1,    1,    0,    1,
1010         0,    1,    1,    0,    1,    0,    1,    1,
1011         0,    1,    0,    1,    0,    1,    0,    1,
1012         0,    1,    0,    1,    1,    0,    1,    0,
1013         1,    0,    1,    0,    1,    0,    1,    1,
1014         0,    1,    0,    1,    0,    1,    0,    1,
1015         1,    0,    1,    0,    1,    0,    1,    0,
1016         1,    0,    1,    0,    1,    0,    1,    0,
1017         1,    0,    1,    0,    1,    0,    1,    1,
1018         0,    1,    0,    1,    0,    1,    0,    1,
1019         0,    1,    0,    1,    0,    1,    0,    1,
1020         0,    1,    0,    1,    0,    1,    1,    0,
1021         1,    0,    1,    0,    1,    0,    1,    0,
1022         1,    0,    1,    0,    1,    0,    1,    0,
1023         1,    1,    0,    1,    0,    1,    0,    1,
1024         0,    1,    0,    1,    0,    1,    0,    1,
1025         1,    0,    1,    0,    1,    0,    1,    0,
1026         1,    0,    1,    1,    0,    1,    0,    1,
1027         0,    1,    0,    1,    0,    1,    0,    1,
1028         0,    1,    1,    0,    1,    0,    1,    0,
1029         1,    0,    1,    0,    1,    0,    1,    0,
1030         1,    0,    1,    0,    1,    0,    1,    1,
1031         0,    1,    0,    1,    0,    1,    0,    1,
1032         0,    1,    2,    0,    1,    0,    1,    0,
1033         1,    0,    1,    0,    1,    0,    1,    0,
1034         1,    0,    1,    1,    0,    1,    0,    1,
1035         1,    0,    1,    0,    1,    0,    1,    0,
1036         1,    0,    1,    1,    2,    1,    1,    2,
1037         2,    0,    2,    1,    2,    0,    2,    2,
1038         1,    1,    2,    1,    1,    2,    1,    0,
1039         1,    1,    0,    1,    0,    1,    2,    1,
1040         0,    2,    1,    2,    1,    0,    1,
1041 };
1042
1043 /* libc_hidden_proto(wcsnrtombs) */
1044
     /*
      * wcswidth - number of display columns needed for at most n wide
      * characters of pwcs, stopping early at a nul.
      *
      * Returns the column count, or -1 as soon as a character is rejected.
      *
      * A first pass validates the string for the current encoding:
      *   7-bit: every character must fit in 0x7f;
      *   8-bit: wcsnrtombs() must be able to convert the string;
      *   UTF-8: only when KUHN is defined, 0xfffe, 0xffff and the range
      *          0xd800..0xdfff are rejected.
      * A second pass then sums the per-character widths.
      */
1045 int wcswidth(const wchar_t *pwcs, size_t n)
1046 {
1047     int h, l, m, count;
1048     wchar_t wc;
1049     unsigned char b;
1050
1051         if (ENCODING == __ctype_encoding_7_bit) {
1052                 size_t i;
1053
1054                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1055                         if (pwcs[i] != (pwcs[i] & 0x7f)) {
1056                                 return -1;
1057                         }
1058                 }
1059         }
1060 #ifdef __CTYPE_HAS_8_BIT_LOCALES
1061         else if (ENCODING == __ctype_encoding_8_bit) {
1062                 mbstate_t mbstate;
1063
     /* A NULL destination makes wcsnrtombs() only check convertibility;
      * in that mode it does not write back through &pwcs. */
1064                 mbstate.__mask = 0;                     /* Initialize the mbstate. */
1065                 if (wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
1066                         return -1;
1067                 }
1068         }
1069 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
1070 #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1071         /* For stricter handling of allowed unicode values... see comments above. */
1072         else if (ENCODING == __ctype_encoding_utf8) {
1073                 size_t i;
1074
1075                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1076                         if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1077                                  || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1078                                 ) {
1079                                 return -1;
1080                         }
1081                 }
1082         }
1083 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
1084
1085     for (count = 0 ; n && (wc = *pwcs++) ; n--) {
     /* Values up to 0xff: control characters (below 32, and 0x7f..0x9f) are
      * rejected, everything else takes one column. */
1086                 if (wc <= 0xff) {
1087                         /* If we're here, wc != 0. */
1088                         if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1089                                 return -1;
1090                         }
1091                         ++count;
1092                         continue;
1093                 }
     /* 0x100..0xffff: new_idx gives the search bounds for this high byte,
      * then a binary search on the low byte over new_tbl picks the slot
      * whose width is read from new_wtbl.  Note that h is reused: first the
      * high byte, then the upper search bound. */
1094                 if (((unsigned int) wc) <= 0xffff) {
1095                         b = wc & 0xff;
1096                         h = (wc >> 8);
1097                         l = new_idx[h];
1098                         h = new_idx[h+1];
1099                         while ((m = (l+h) >> 1) != l) {
1100                                 if (b >= new_tbl[m]) {
1101                                         l = m;
1102                                 } else {                /* wc < tbl[m] */
1103                                         h = m;
1104                                 }
1105                         }
1106                         count += new_wtbl[l]; /* none should be -1. */
1107                         continue;
1108                 }
1109
     /* Above 0xffff the default width is one column (the ++count at the
      * bottom).  The ranges tested below either take no column (continue)
      * or two columns (the extra ++count for 0x20000..0x2ffff). */
1110                 /* Redo this to minimize average number of compares?*/
1111                 if (wc >= 0x1d167) {
1112                         if (wc <= 0x1d1ad) {
1113                                 if ((wc <= 0x1d169
1114                                          || (wc >= 0x1d173
1115                                                  && (wc <= 0x1d182
1116                                                          || (wc >= 0x1d185
1117                                                                  && (wc <= 0x1d18b
1118                                                                          || (wc >= 0x1d1aa))))))
1119                                         ) {
1120                                         continue;
1121                                 }
1122                         } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1123                                 continue;
1124                         } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1125                                 ++count;                /* need 2.. add one here */
1126                         }
1127 #if (WCHAR_MAX > 0x7fffffffL)
1128                         else if (wc > 0x7fffffffL) {
1129                                 return -1;
1130                         }
1131 #endif /* (WCHAR_MAX > 0x7fffffffL) */
1132                 }
1133
1134                 ++count;
1135     }
1136
1137     return count;
1138 }
1139
1140 #else  /*  __UCLIBC_HAS_LOCALE__ */
1141
/*
 * wcswidth - fallback used when the table-driven implementation above is
 * compiled out.
 *
 * Examines at most n wide characters of pwcs, stopping early at a nul.
 * Only printable ASCII (0x20..0x7e) is accepted, each character taking one
 * column.  Returns the number of columns, or -1 as soon as any other
 * character is met.
 *
 * The string is walked once: a character that is either not 7-bit or a
 * control character yields -1 no matter where it sits, so a separate
 * validation pass is not needed.
 */
int wcswidth(const wchar_t *pwcs, size_t n)
{
        int count;
        wchar_t wc;
        size_t i;

        count = 0;
        for (i = 0 ; (i < n) && pwcs[i] ; i++) {
                wc = pwcs[i];
                /* The first test also rejects negative values when wchar_t
                 * is a signed type. */
                if ((wc < 0x20) || (wc >= 0x7f)) {
                        return -1;
                }
                ++count;
        }

        return count;
}
1169
1170 #endif /*  __UCLIBC_HAS_LOCALE__ */
1171
1172 libc_hidden_def(wcswidth)
1173
1174 #endif
1175 /**********************************************************************/
1176 #ifdef L_wcwidth
1177
1178 /* libc_hidden_proto(wcswidth) */
1179
     /*
      * wcwidth - column width of a single wide character.
      * Implemented by asking wcswidth() about the one-character sequence at
      * &wc, so it returns whatever wcswidth() returns for that character
      * (including -1 for rejected characters and 0 for a nul).
      */
1180 int wcwidth(wchar_t wc)
1181 {
1182     return wcswidth(&wc, 1);
1183 }
1184
1185 #endif
1186 /**********************************************************************/
1187
1188
     /*
      * Conversion descriptor allocated by iconv_open() and handed to callers
      * as an iconv_t.
      *
      * tostate / fromstate      multibyte states; their __mask members are
      *                          zeroed when the descriptor is created or reset.
      * tocodeset / fromcodeset  codeset ids as returned by find_codeset().
      * tobom / frombom          set from bit 0x10 of the codeset id by
      *                          iconv_open() and cleared by iconv() once the
      *                          byte-order mark has been dealt with.
      * fromcodeset0, frombom0,
      * tobom0                   values recorded at open time; iconv() copies
      *                          them back when asked to reset the descriptor.
      */
1189 typedef struct {
1190         mbstate_t tostate;
1191         mbstate_t fromstate;
1192         int tocodeset;
1193         int fromcodeset;
1194         int frombom;
1195         int tobom;
1196         int fromcodeset0;
1197         int frombom0;
1198         int tobom0;
1199         int skip_invalid_input;         /* To support iconv -c option. */
1200 } _UC_iconv_t;
1201
1202
1203
1204 #ifdef L_iconv
1205
1206 #include <iconv.h>
1207 #include <string.h>
1208 #include <endian.h>
1209 #include <byteswap.h>
1210
1211 #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1212 #error unsupported endianness for iconv
1213 #endif
1214
1215 #ifndef __CTYPE_HAS_8_BIT_LOCALES
1216 #error currently iconv requires 8 bit locales
1217 #endif
1218 #ifndef __CTYPE_HAS_UTF_8_LOCALES
1219 #error currently iconv requires UTF-8 locales
1220 #endif
1221
1222
     /*
      * Codeset ids used by the iconv code.
      *
      * Ids at or above IC_MULTIBYTE (0xe0) denote the fixed-width wide
      * encodings; iconv() derives the unit size from them (4 bytes for
      * IC_WCHAR_T, otherwise id & 6) and tests bit 0 to decide whether to
      * byte-swap.  The UCS-4/UTF-32/UCS-2/UTF-16 values differ only in bit 0
      * between the two byte orders.
      * IC_ASCII (1) and IC_UTF_8 (2) are the small ids; find_codeset()
      * numbers the 8-bit codesets from 3 upwards.
      */
1223 enum {
1224         IC_WCHAR_T = 0xe0,
1225         IC_MULTIBYTE = 0xe0,
1226 #if __BYTE_ORDER == __BIG_ENDIAN
1227         IC_UCS_4 =      0xec,
1228         IC_UTF_32 = 0xe4,
1229         IC_UCS_2 =      0xe2,
1230         IC_UTF_16 = 0xea,
1231 #else
1232         IC_UCS_4 =      0xed,
1233         IC_UTF_32 = 0xe5,
1234         IC_UCS_2 =      0xe3,
1235         IC_UTF_16 = 0xeb,
1236 #endif
1237         IC_UTF_8 = 2,
1238         IC_ASCII = 1
1239 };
1240
1241 /* For the multibyte
1242  * bit 0 means swap endian
1243  * bit 1 means 2 byte
1244  * bit 2 means 4 byte
1245  *
1246  */
1247
1248 /* Used externally only by iconv utility */
1249 extern const unsigned char __iconv_codesets[];
1250 libc_hidden_proto(__iconv_codesets)
1251
     /*
      * Packed list of the built-in codeset names, scanned by find_codeset().
      *
      * Each entry is: one byte giving the distance to the next entry, one
      * byte holding the codeset id, then the name and its terminating nul.
      * The final entry omits the explicit nul and its length byte points at
      * the string literal's own terminator, which doubles as the end-of-list
      * marker (a zero length byte).
      *
      * NOTE(review): the little-endian branch gives plain "UTF-32"/"UTF-16"
      * ids with bit 0x10 set (0xf4, 0xfa), which iconv_open() reads as
      * "byte-order mark wanted"; the big-endian branch uses 0xe4/0xea
      * without that bit -- confirm whether this difference is intended.
      */
1252 const unsigned char __iconv_codesets[] =
1253         "\x0a\xe0""WCHAR_T\x00"         /* superset of UCS-4 but platform-endian */
1254 #if __BYTE_ORDER == __BIG_ENDIAN
1255         "\x08\xec""UCS-4\x00"           /* always BE */
1256         "\x0a\xec""UCS-4BE\x00"
1257         "\x0a\xed""UCS-4LE\x00"
1258         "\x09\xe4""UTF-32\x00"          /* platform endian with BOM */
1259         "\x0b\xe4""UTF-32BE\x00"
1260         "\x0b\xe5""UTF-32LE\x00"
1261         "\x08\xe2""UCS-2\x00"           /* always BE */
1262         "\x0a\xe2""UCS-2BE\x00"
1263         "\x0a\xe3""UCS-2LE\x00"
1264         "\x09\xea""UTF-16\x00"          /* platform endian with BOM */
1265         "\x0b\xea""UTF-16BE\x00"
1266         "\x0b\xeb""UTF-16LE\x00"
1267 #elif __BYTE_ORDER == __LITTLE_ENDIAN
1268         "\x08\xed""UCS-4\x00"           /* always BE */
1269         "\x0a\xed""UCS-4BE\x00"
1270         "\x0a\xec""UCS-4LE\x00"
1271         "\x09\xf4""UTF-32\x00"          /* platform endian with BOM */
1272         "\x0b\xe5""UTF-32BE\x00"
1273         "\x0b\xe4""UTF-32LE\x00"
1274         "\x08\xe3""UCS-2\x00"           /* always BE */
1275         "\x0a\xe3""UCS-2BE\x00"
1276         "\x0a\xe2""UCS-2LE\x00"
1277         "\x09\xfa""UTF-16\x00"          /* platform endian with BOM */
1278         "\x0b\xeb""UTF-16BE\x00"
1279         "\x0b\xea""UTF-16LE\x00"
1280 #endif
1281         "\x08\x02""UTF-8\x00"
1282         "\x0b\x01""US-ASCII\x00"
1283         "\x07\x01""ASCII";                      /* Must be last! (special case to save a nul) */
1284 libc_hidden_data_def(__iconv_codesets)
1285
1286 /* Experimentally off - libc_hidden_proto(strcasecmp) */
1287
1288 static int find_codeset(const char *name)
1289 {
1290         const unsigned char *s;
1291         int codeset;
1292
1293         for (s = __iconv_codesets; *s; s += *s) {
1294                 if (!strcasecmp((char*) (s + 2), name)) {
1295                         return s[1];
1296                 }
1297         }
1298
1299         /* The following is ripped from find_locale in locale.c. */
1300
1301         /* TODO: maybe CODESET_LIST + *s ??? */
1302         /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1303         codeset = 2;
1304         s = (const unsigned char *) __LOCALE_DATA_CODESET_LIST;
1305         do {
1306                 ++codeset;              /* Increment codeset first. */
1307                 if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
1308                         return codeset;
1309                 }
1310         } while (*++s);
1311
1312         return 0;                       /* No matching codeset! */
1313 }
1314
1315 iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
1316 {
1317         register _UC_iconv_t *px;
1318         int tocodeset, fromcodeset;
1319
1320         if (((tocodeset = find_codeset(tocode)) != 0)
1321                 && ((fromcodeset = find_codeset(fromcode)) != 0)) {
1322                 if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1323                         px->tocodeset = tocodeset;
1324                         px->tobom0 = px->tobom = (tocodeset & 0x10) >> 4;
1325                         px->fromcodeset0 = px->fromcodeset = fromcodeset;
1326                         px->frombom0 = px->frombom = (fromcodeset & 0x10) >> 4;
1327                         px->skip_invalid_input = px->tostate.__mask
1328                                 = px->fromstate.__mask = 0;
1329                         return (iconv_t) px;
1330                 }
1331         } else {
1332                 __set_errno(EINVAL);
1333         }
1334         return (iconv_t)(-1);
1335 }
1336
1337 int weak_function iconv_close(iconv_t cd)
1338 {
1339         free(cd);
1340
1341         return 0;
1342 }
1343
1344 size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
1345                                                    size_t *__restrict inbytesleft,
1346                                                    char **__restrict outbuf,
1347                                                    size_t *__restrict outbytesleft)
1348 {
1349         _UC_iconv_t *px = (_UC_iconv_t *) cd;
1350         size_t nrcount, r;
1351         wchar_t wc, wc2;
1352         int inci, inco;
1353
1354         assert(px != (_UC_iconv_t *)(-1));
1355         assert(sizeof(wchar_t) == 4);
1356
1357         if (!inbuf || !*inbuf) {        /* Need to reinitialze conversion state. */
1358                 /* Note: For shift-state encodings we possibly need to output the
1359                  * shift sequence to return to initial state! */
1360                 if ((px->fromcodeset & 0xf0) == 0xe0) {
1361                 }
1362                 px->tostate.__mask = px->fromstate.__mask = 0;
1363                 px->fromcodeset = px->fromcodeset0;
1364                 px->tobom = px->tobom0;
1365                 px->frombom = px->frombom0;
1366                 return 0;
1367         }
1368
1369         nrcount = 0;
1370         while (*inbytesleft) {
1371                 if (!*outbytesleft) {
1372                 TOO_BIG:
1373                         __set_errno(E2BIG);
1374                         return (size_t) -1;
1375                 }
1376
1377                 inci = inco = 1;
1378                 if (px->fromcodeset >= IC_MULTIBYTE) {
1379                         inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1380                         if (*inbytesleft < inci) goto INVALID;
1381                         wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1382                                 + ((unsigned char)((*inbuf)[1]));
1383                         if (inci == 4) {
1384                                 wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1385                                         + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1386                                 if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1387                         } else {
1388                                 if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1389                                 if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1390                                          && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1391                                         ) {                     /* surrogate */
1392                                         wc =- 0xd800U;
1393                                         if (*inbytesleft < 4) goto INVALID;
1394                                         wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1395                                                 + ((unsigned char)((*inbuf)[3]));
1396                                         if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1397                                         if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1398                                                 goto ILLEGAL;
1399                                         }
1400                                         inci = 4;       /* Change inci here in case skipping illegals. */
1401                                         wc = 0x10000UL + (wc << 10) + wc2;
1402                                 }
1403                         }
1404
1405                         if (px->frombom) {
1406                                 px->frombom = 0;
1407                                 if ((wc == 0xfeffU)
1408                                         || (wc == ((inci == 4)
1409                                                            ? (((wchar_t) 0xfffe0000UL))
1410                                                            : ((wchar_t)(0xfffeUL))))
1411                                         ) {
1412                                         if (wc != 0xfeffU) {
1413                                                 px->fromcodeset ^= 1; /* toggle endianness */
1414                                                 wc = 0xfeffU;
1415                                         }
1416                                         if (!px->frombom) {
1417                                                 goto BOM_SKIP_OUTPUT;
1418                                         }
1419                                         goto GOT_BOM;
1420                                 }
1421                         }
1422
1423                         if (px->fromcodeset != IC_WCHAR_T) {
1424                                 if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1425                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1426 #ifdef KUHN
1427                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1428                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1429 #endif
1430                                         ) {
1431                                         goto ILLEGAL;
1432                                 }
1433                         }
1434                 } else if (px->fromcodeset == IC_UTF_8) {
1435                         const char *p = *inbuf;
1436                         r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1437                         if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1438                                 if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1439                                         assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1440                                         if (r == (size_t)(-2)) {
1441                                         INVALID:
1442                                                 __set_errno(EINVAL);
1443                                         } else {
1444                                                 px->fromstate.__mask = 0;
1445                                                 inci = 1;
1446                                         ILLEGAL:
1447                                                 if (px->skip_invalid_input) {
1448                                                         px->skip_invalid_input = 2;     /* flag for iconv utility */
1449                                                         goto BOM_SKIP_OUTPUT;
1450                                                 }
1451                                                 __set_errno(EILSEQ);
1452                                         }
1453                                         return (size_t)(-1);
1454                                 }
1455 #ifdef __UCLIBC_MJN3_ONLY__
1456 #warning TODO: optimize this.
1457 #endif
1458                                 if (p != NULL) { /* incomplete char case */
1459                                         goto INVALID;
1460                                 }
1461                                 p = *inbuf + 1; /* nul */
1462                         }
1463                         inci = p - *inbuf;
1464                 } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1465                         if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1466                                 goto ILLEGAL;
1467                         } else {                        /* some other 8-bit ascii-extension codeset */
1468                                 const __codeset_8_bit_t *c8b
1469                                         = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1470                                 wc -= 0x80;
1471                                 wc = __UCLIBC_CURLOCALE->tbl8c2wc[
1472                                                          (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1473                                                           << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1474                                 if (!wc) {
1475                                         goto ILLEGAL;
1476                                 }
1477                         }
1478                 }
1479
1480
1481                 if (px->tobom) {
1482                         inci = 0;
1483                         wc = 0xfeffU;
1484         GOT_BOM:
1485                         px->tobom = 0;
1486                 }
1487
1488                 if (px->tocodeset >= IC_MULTIBYTE) {
1489                         inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1490                         if (*outbytesleft < inco) goto TOO_BIG;
1491                         if (px->tocodeset != IC_WCHAR_T) {
1492                                 if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1493                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1494 #ifdef KUHN
1495                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1496                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1497 #endif
1498                                         ) {
1499                                 REPLACE_32:
1500                                         wc = 0xfffd;
1501                                         ++nrcount;
1502                                 }
1503                         }
1504                         if (inco == 4) {
1505                                 if (px->tocodeset & 1) wc = bswap_32(wc);
1506                         } else {
1507                                 if (((__uwchar_t)wc ) > 0xffffU) {
1508                                         if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1509                                                 goto REPLACE_32;
1510                                         }
1511                                         if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1512                                         wc2 = 0xdc00U + (wc & 0x3ff);
1513                                         wc = 0xd800U + ((wc >> 10) & 0x3ff);
1514                                         if (px->tocodeset & 1) {
1515                                                 wc = bswap_16(wc);
1516                                                 wc2 = bswap_16(wc2);
1517                                         }
1518                                         wc += (wc2 << 16);
1519                                 } else if (px->tocodeset & 1) wc = bswap_16(wc);
1520                         }
1521                         (*outbuf)[0] = (char)((unsigned char)(wc));
1522                         (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1523                         if (inco == 4) {
1524                                 (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1525                                 (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1526                         }
1527                 } else if (px->tocodeset == IC_UTF_8) {
1528                         const wchar_t *pw = &wc;
1529                         do {
1530                                 r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1531                                 if (r != (size_t)(-1)) {
1532 #ifdef __UCLIBC_MJN3_ONLY__
1533 #warning TODO: What happens for a nul?
1534 #endif
1535                                         if (r == 0) {
1536                                                 if (wc != 0) {
1537                                                         goto TOO_BIG;
1538                                                 }
1539                                                 ++r;
1540                                         }
1541                                         break;
1542                                 }
1543                                 wc = 0xfffdU;
1544                                 ++nrcount;
1545                         } while (1);
1546                         inco = r;
1547                 } else if (((__uwchar_t)(wc)) < 0x80) {
1548                 CHAR_GOOD:
1549                                 **outbuf = wc;
1550                 } else {
1551                         if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1552                                 const __codeset_8_bit_t *c8b
1553                                         = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1554                                 __uwchar_t u;
1555                                 u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1556                                 u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1557                                                  + ((wc >> Cwc2c_TT_SHIFT)
1558                                                         & ((1 << Cwc2c_TI_SHIFT)-1))];
1559                                 wc = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
1560                                                  + (u << Cwc2c_TT_SHIFT)
1561                                                  + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1562                                 if (wc) {
1563                                         goto CHAR_GOOD;
1564                                 }
1565                         }
1566                         **outbuf = '?';
1567                         ++nrcount;
1568                 }
1569
1570                 *outbuf += inco;
1571                 *outbytesleft -= inco;
1572         BOM_SKIP_OUTPUT:
1573                 *inbuf += inci;
1574                 *inbytesleft -= inci;
1575         }
1576         return nrcount;
1577 }
1578
1579 #endif
1580 /**********************************************************************/
1581 #ifdef L_iconv_main
1582
1583 #include <string.h>
1584 #include <iconv.h>
1585 #include <stdarg.h>
1586 #include <libgen.h>
1587
/* Table of built-in codeset names, defined outside this section.  The -l
 * handler in main() walks it as a sequence of records: it advances by the
 * value of each record's first byte, prints the string found at offset 2,
 * and stops at a zero byte. */
extern const unsigned char __iconv_codesets[];

/* Sizes of the input and output staging buffers used by main(). */
#define IBUF BUFSIZ
#define OBUF BUFSIZ

/* Program name as invoked (argv[0]); prefixes every error_msg() diagnostic. */
static char *progname;
/* Set to 1 by main() when -s is given: error_msg() then exits silently. */
static int hide_errors;
1595
1596 static void error_msg(const char *fmt, ...)
1597          __attribute__ ((noreturn, format (printf, 1, 2)));
1598
1599 static void error_msg(const char *fmt, ...)
1600 {
1601         va_list arg;
1602
1603         if (!hide_errors) {
1604                 fprintf(stderr, "%s: ", progname);
1605                 va_start(arg, fmt);
1606                 vfprintf(stderr, fmt, arg);
1607                 va_end(arg);
1608         }
1609
1610         exit(EXIT_FAILURE);
1611 }
1612
1613 int main(int argc, char **argv)
1614 {
1615         FILE *ifile;
1616         FILE *ofile = stdout;
1617         const char *p;
1618         const char *s;
1619         static const char opt_chars[] = "tfocsl";
1620                                       /* 012345 */
1621         const char *opts[sizeof(opt_chars)]; /* last is infile name */
1622         iconv_t ic;
1623         char ibuf[IBUF];
1624         char obuf[OBUF];
1625         char *pi;
1626         char *po;
1627         size_t ni, no, r, pos;
1628
1629         hide_errors = 0;
1630
1631         for (s = opt_chars ; *s ; s++) {
1632                 opts[ s - opt_chars ] = NULL;
1633         }
1634
1635         progname = *argv;
1636         while (--argc) {
1637                 p = *++argv;
1638                 if ((*p != '-') || (*++p == 0)) {
1639                         break;
1640                 }
1641                 do {
1642                         if ((s = strchr(opt_chars,*p)) == NULL) {
1643                         USAGE:
1644                                 s = basename(progname);
1645                                 fprintf(stderr,
1646                                                 "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
1647                                                 "  or\n%s -l\n", s, s);
1648                                 return EXIT_FAILURE;
1649                         }
1650                         if ((s - opt_chars) < 3) {
1651                                 if ((--argc == 0) || opts[s - opt_chars]) {
1652                                         goto USAGE;
1653                                 }
1654                                 opts[s - opt_chars] = *++argv;
1655                         } else {
1656                                 opts[s - opt_chars] = p;
1657                         }
1658                 } while (*++p);
1659         }
1660
1661         if (opts[5]) {                          /* -l */
1662                 fprintf(stderr, "Recognized codesets:\n");
1663                 for (s = (char *)__iconv_codesets ; *s ; s += *s) {
1664                         fprintf(stderr,"  %s\n", s+2);
1665                 }
1666                 s = __LOCALE_DATA_CODESET_LIST;
1667                 do {
1668                         fprintf(stderr,"  %s\n", __LOCALE_DATA_CODESET_LIST+ (unsigned char)(*s));
1669                 } while (*++s);
1670
1671                 return EXIT_SUCCESS;
1672         }
1673
1674         if (opts[4]) {
1675                 hide_errors = 1;
1676         }
1677
1678         if (!opts[0] || !opts[1]) {
1679                 goto USAGE;
1680         }
1681         if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
1682                 error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]);
1683         }
1684         if (opts[3]) {                          /* -c */
1685                 ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
1686         }
1687
1688         if ((s = opts[2]) != NULL) {
1689                 if (!(ofile = fopen(s, "w"))) {
1690                         error_msg( "couldn't open %s for writing\n", s);
1691                 }
1692         }
1693
1694         pos = ni = 0;
1695         do {
1696                 if (!argc || ((**argv == '-') && !((*argv)[1]))) {
1697                         ifile = stdin;          /* we don't check for duplicates */
1698                 } else if (!(ifile = fopen(*argv, "r"))) {
1699                         error_msg( "couldn't open %s for reading\n", *argv);
1700                 }
1701
1702                 while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
1703                         pos += r;
1704                         ni += r;
1705                         no = OBUF;
1706                         pi = ibuf;
1707                         po = obuf;
1708                         if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
1709                                 if ((errno != EINVAL) && (errno != E2BIG)) {
1710                                         error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
1711                                 }
1712                         }
1713                         if ((r = OBUF - no) > 0) {
1714                                 if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
1715                                         error_msg( "write error\n");
1716                                 }
1717                         }
1718                         if (ni) {                       /* still bytes in buffer! */
1719                                 memmove(ibuf, pi, ni);
1720                         }
1721                 }
1722
1723                 if (ferror(ifile)) {
1724                         error_msg( "read error\n");
1725                 }
1726
1727                 ++argv;
1728
1729                 if (ifile != stdin) {
1730                         fclose(ifile);
1731                 }
1732
1733         } while (--argc > 0);
1734
1735         iconv_close(ic);
1736
1737         if (ni) {
1738                 error_msg( "incomplete sequence\n");
1739         }
1740
1741         return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
1742                 ? EXIT_SUCCESS : EXIT_FAILURE;
1743 }
1744
1745 #endif
1746 /**********************************************************************/