winsup/cygwin/strfuncs.cc

   1 /* strfuncs.cc: misc funcs that don't belong anywhere else
   2
   3    Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4    2005, 2006, 2007, 2008, 2009 Red Hat, Inc.
   5
   6 This file is part of Cygwin.
   7
   8 This software is a copyrighted work licensed under the terms of the
   9 Cygwin license.  Please consult the file "CYGWIN_LICENSE" for
  10 details. */
  11
  12 #include "winsup.h"
  13 #include <stdlib.h>
  14 #include <wchar.h>
  15 #include <winnls.h>
  16 #include <ntdll.h>
  17 #include "cygerrno.h"
  18 #include "security.h"
  19 #include "path.h"
  20 #include "fhandler.h"
  21 #include "dtable.h"
  22 #include "cygheap.h"
  23 #include "tls_pbuf.h"
  24
  25 /* Transform characters invalid for Windows filenames to the Unicode private
  26    use area in the U+f0XX range.  The affected characters are all control
  27    chars 1 <= c <= 31, as well as the characters " * : < > ? |.  The backslash
  28    is affected as well, but we can't transform it as long as we accept Win32
  29    paths as input.
  30    The reverse functionality is in function sys_cp_wcstombs. */
  31 static const WCHAR tfx_chars[] = {
  32             0, 0xf000 |   1, 0xf000 |   2, 0xf000 |   3,
  33  0xf000 |   4, 0xf000 |   5, 0xf000 |   6, 0xf000 |   7,
  34  0xf000 |   8, 0xf000 |   9, 0xf000 |  10, 0xf000 |  11,
  35  0xf000 |  12, 0xf000 |  13, 0xf000 |  14, 0xf000 |  15,
  36  0xf000 |  16, 0xf000 |  17, 0xf000 |  18, 0xf000 |  19,
  37  0xf000 |  20, 0xf000 |  21, 0xf000 |  22, 0xf000 |  23,
  38  0xf000 |  24, 0xf000 |  25, 0xf000 |  26, 0xf000 |  27,
  39  0xf000 |  28, 0xf000 |  29, 0xf000 |  30, 0xf000 |  31,
  40           ' ',          '!', 0xf000 | '"',          '#',
  41           '$',          '%',          '&',           39,
  42           '(',          ')', 0xf000 | '*',          '+',
  43           ',',          '-',          '.',          '\\',
  44           '0',          '1',          '2',          '3',
  45           '4',          '5',          '6',          '7',
  46           '8',          '9', 0xf000 | ':',          ';',
  47  0xf000 | '<',          '=', 0xf000 | '>', 0xf000 | '?',
  48           '@',          'A',          'B',          'C',
  49           'D',          'E',          'F',          'G',
  50           'H',          'I',          'J',          'K',
  51           'L',          'M',          'N',          'O',
  52           'P',          'Q',          'R',          'S',
  53           'T',          'U',          'V',          'W',
  54           'X',          'Y',          'Z',          '[',
  55           '\\',          ']',          '^',          '_',
  56           '`',          'a',          'b',          'c',
  57           'd',          'e',          'f',          'g',
  58           'h',          'i',          'j',          'k',
  59           'l',          'm',          'n',          'o',
  60           'p',          'q',          'r',          's',
  61           't',          'u',          'v',          'w',
  62           'x',          'y',          'z',          '{',
  63  0xf000 | '|',          '}',          '~',          127
  64 };
  65
  66 void
  67 transform_chars (PWCHAR path, PWCHAR path_end)
  68 {
  69   for (; path <= path_end; ++path)
  70     if (*path < 128)
  71       *path = tfx_chars[*path];
  72 }
  73
/* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
   wchar_t character representation.  That's unfortunate for us since
   we require UTF for the OS.  What we do here is to have our own
   implementation of the base functions for the conversion using
   the MultiByteToWideChar/WideCharToMultiByte functions. */
  79
/* FIXME: We can't support JIS (ISO-2022-JP) at all right now.  It's a
   stateful charset encoding.  The translation from mbtowc to
   MultiByteToWideChar is quite complex.  Given that we support SJIS and
   eucJP, the two most widely used Japanese charset encodings, this
   shouldn't be such a big problem. */
  85
  86 /* GBK, eucKR, and Big5 conversions are not available so far in newlib. */
  87
  88 static int
  89 __db_wctomb (struct _reent *r, char *s, wchar_t wchar, UINT cp)
  90 {
  91   if (s == NULL)
  92     return 0;
  93
  94   if (wchar < 0x80)
  95     {
  96       *s = (char) wchar;
  97       return 1;
  98     }
  99
 100   BOOL def_used = false;
 101   int ret = WideCharToMultiByte (cp, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
 102                                  2, NULL, &def_used);
 103   if (ret > 0 && !def_used)
 104     return ret;
 105
 106   r->_errno = EILSEQ;
 107   return -1;
 108 }
 109
 110 extern "C" int
 111 __sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 112                mbstate_t *state)
 113 {
 114   return __db_wctomb (r,s, wchar, 932);
 115 }
 116
 117 extern "C" int
 118 __jis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 119                mbstate_t *state)
 120 {
 121   /* FIXME: See comment at start of file. */
 122   return __ascii_wctomb (r, s, wchar, charset, state);
 123 }
 124
 125 extern "C" int
 126 __eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 127                mbstate_t *state)
 128 {
 129   /* Unfortunately, the Windows eucJP codepage 20932 is not really 100%
 130      compatible to eucJP.  It's a cute approximation which makes it a
 131      doublebyte codepage.
 132      The JIS-X-0212 three byte codes (0x8f,0xa1-0xfe,0xa1-0xfe) are folded
 133      into two byte codes as follows: The 0x8f is stripped, the next byte is
 134      taken as is, the third byte is mapped into the lower 7-bit area by
 135      masking it with 0x7f.  So, for instance, the eucJP code 0x8f,0xdd,0xf8
 136      becomes 0xdd,0x78 in CP 20932.
 137
 138      To be really eucJP compatible, we have to map the JIS-X-0212 characters
 139      between CP 20932 and eucJP ourselves. */
 140   if (s == NULL)
 141     return 0;
 142
 143   if (wchar < 0x80)
 144     {
 145       *s = (char) wchar;
 146       return 1;
 147     }
 148
 149   BOOL def_used = false;
 150   int ret = WideCharToMultiByte (20932, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
 151                                  3, NULL, &def_used);
 152   if (ret > 0 && !def_used)
 153     {
 154       /* CP20932 representation of JIS-X-0212 character? */
 155       if (ret == 2 && (unsigned char) s[1] <= 0x7f)
 156         {
 157           /* Yes, convert to eucJP three byte sequence */
 158           s[2] = s[1] | 0x80;
 159           s[1] = s[0];
 160           s[0] = 0x8f;
 161           ++ret;
 162         }
 163       return ret;
 164     }
 165
 166   r->_errno = EILSEQ;
 167   return -1;
 168 }
 169
 170 extern "C" int
 171 __gbk_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 172                mbstate_t *state)
 173 {
 174   return __db_wctomb (r,s, wchar, 936);
 175 }
 176
 177 extern "C" int
 178 __kr_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 179                mbstate_t *state)
 180 {
 181   return __db_wctomb (r,s, wchar, 949);
 182 }
 183
 184 extern "C" int
 185 __big5_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 186                mbstate_t *state)
 187 {
 188   return __db_wctomb (r,s, wchar, 950);
 189 }
 190
 191 static int
 192 __db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, UINT cp,
 193              mbstate_t *state)
 194 {
 195   wchar_t dummy;
 196   int ret;
 197
 198   if (s == NULL)
 199     return 0;  /* not state-dependent */
 200
 201   if (n == 0)
 202     return -2;
 203
 204   if (pwc == NULL)
 205     pwc = &dummy;
 206
 207   if (state->__count == 0)
 208     {
 209       if (*(unsigned char *) s < 0x80)
 210         {
 211           *pwc = *(unsigned char *) s;
 212           return *s ? 1 : 0;
 213         }
 214       size_t cnt = min (n, 2);
 215       ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1);
 216       if (ret)
 217         return cnt;
 218       if (n == 1)
 219         {
 220           state->__count = n;
 221           state->__value.__wchb[0] = *s;
 222           return -2;
 223         }
 224       /* These Win32 functions are really crappy.  Assuming n is 2 but the
 225          first byte is a singlebyte charcode, the function does not convert
 226          that byte and return 1, rather it just returns 0.  So, what we do
 227          here is to check if the first byte returns a valid value... */
 228       else if (MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
 229         return 1;
 230       r->_errno = EILSEQ;
 231       return -1;
 232     }
 233   state->__value.__wchb[state->__count] = *s;
 234   ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS,
 235                              (const char *) state->__value.__wchb, 2, pwc, 1);
 236   if (!ret)
 237     {
 238       r->_errno = EILSEQ;
 239       return -1;
 240     }
 241   state->__count = 0;
 242   return 1;
 243 }
 244
 245 extern "C" int
 246 __sjis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 247                const char *charset, mbstate_t *state)
 248 {
 249   return __db_mbtowc (r, pwc, s, n, 932, state);
 250 }
 251
 252 extern "C" int
 253 __jis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 254                const char *charset, mbstate_t *state)
 255 {
 256   /* FIXME: See comment at start of file. */
 257   return __ascii_mbtowc (r, pwc, s, n, charset, state);
 258 }
 259
 260 extern "C" int
 261 __eucjp_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 262                 const char *charset, mbstate_t *state)
 263 {
 264   /* See comment in __eucjp_wctomb above. */
 265   wchar_t dummy;
 266   int ret = 0;
 267
 268   if (s == NULL)
 269     return 0;  /* not state-dependent */
 270
 271   if (n == 0)
 272     return -2;
 273
 274   if (pwc == NULL)
 275     pwc = &dummy;
 276
 277   if (state->__count == 0)
 278     {
 279       if (*(unsigned char *) s < 0x80)
 280         {
 281           *pwc = *(unsigned char *) s;
 282           return *s ? 1 : 0;
 283         }
 284       if (*(unsigned char *) s == 0x8f) /* JIS-X-0212 lead byte? */
 285         {
 286           /* Yes.  Store sequence in mbstate and handle in the __count != 0
 287              case at the end of the function. */
 288           size_t i;
 289           for (i = 0; i < 3 && i < n; i++)
 290             state->__value.__wchb[i] = s[i];
 291           if ((state->__count = i) < 3) /* Incomplete sequence? */
 292             return -2;
 293           ret = 3;
 294           goto jis_x_0212;
 295         }
 296       size_t cnt = min (n, 2);
 297       if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1))
 298         return cnt;
 299       if (n == 1)
 300         {
 301           state->__count = 1;
 302           state->__value.__wchb[0] = *s;
 303           return -2;
 304         }
 305       else if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
 306         return 1;
 307       r->_errno = EILSEQ;
 308       return -1;
 309     }
 310   state->__value.__wchb[state->__count++] = *s;
 311   ret = 1;
 312 jis_x_0212:
 313   if (state->__value.__wchb[0] == 0x8f)
 314     {
 315       if (state->__count == 2)
 316         {
 317           if (n == 1)
 318             return -2;
 319           state->__value.__wchb[state->__count] = s[1];
 320           ret = 2;
 321         }
 322       /* Ok, we have a full JIS-X-0212 sequence in mbstate.  Convert it
 323          to the CP 20932 representation and feed it to MultiByteToWideChar. */
 324       state->__value.__wchb[0] = state->__value.__wchb[1];
 325       state->__value.__wchb[1] = state->__value.__wchb[2] & 0x7f;
 326     }
 327   if (!MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS,
 328                             (const char *) state->__value.__wchb, 2, pwc, 1))
 329     {
 330       r->_errno = EILSEQ;
 331       return -1;
 332     }
 333   state->__count = 0;
 334   return ret;
 335 }
 336
 337 extern "C" int
 338 __gbk_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 339                const char *charset, mbstate_t *state)
 340 {
 341   return __db_mbtowc (r, pwc, s, n, 936, state);
 342 }
 343
 344 extern "C" int
 345 __kr_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 346                const char *charset, mbstate_t *state)
 347 {
 348   return __db_mbtowc (r, pwc, s, n, 949, state);
 349 }
 350
 351 extern "C" int
 352 __big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 353                const char *charset, mbstate_t *state)
 354 {
 355   return __db_mbtowc (r, pwc, s, n, 950, state);
 356 }
 357
 358 /* Convert Windows codepage to a setlocale compatible character set code.
 359    Called from newlib's setlocale() with codepage set to 0, if the
 360    charset isn't given explicitely in the POSIX compatible locale specifier.
 361    The function also returns a pointer to the corresponding _mbtowc_r
 362    function. */
 363 extern "C" mbtowc_p
 364 __set_charset_from_codepage (UINT cp, char *charset)
 365 {
 366   if (cp == 0)
 367     cp = GetACP ();
 368   switch (cp)
 369     {
 370     case 437:
 371     case 720:
 372     case 737:
 373     case 775:
 374     case 850:
 375     case 852:
 376     case 855:
 377     case 857:
 378     case 858:
 379     case 862:
 380     case 866:
 381     case 874:
 382     case 1125:
 383     case 1250:
 384     case 1251:
 385     case 1252:
 386     case 1253:
 387     case 1254:
 388     case 1255:
 389     case 1256:
 390     case 1257:
 391     case 1258:
 392     case 20866:
 393     case 21866:
 394       __small_sprintf (charset, "CP%u", cp);
 395       return __cp_mbtowc;
 396     case 28591:
 397     case 28592:
 398     case 28593:
 399     case 28594:
 400     case 28595:
 401     case 28596:
 402     case 28597:
 403     case 28598:
 404     case 28599:
 405     case 28603:
 406     case 28605:
 407       __small_sprintf (charset, "ISO-8859-%u", cp - 28590);
 408       return __iso_mbtowc;
 409     case 932:
 410       strcpy (charset, "SJIS");
 411       return __sjis_mbtowc;
 412     case 936:
 413       strcpy (charset, "GBK");
 414       return __gbk_mbtowc;
 415     case 949:
 416     case 51949:
 417       strcpy (charset, "EUCKR");
 418       return __kr_mbtowc;
 419     case 950:
 420       strcpy (charset, "BIG5");
 421       return __big5_mbtowc;
 422     case 50220:
 423       strcpy (charset, "JIS");
 424       return __jis_mbtowc;
 425     case 20932:
 426     case 51932:
 427       strcpy (charset, "EUCJP");
 428       return __eucjp_mbtowc;
 429     case 65001:
 430       strcpy (charset, "UTF-8");
 431       return __utf8_mbtowc;
 432     default:
 433       break;
 434     }
 435   strcpy (charset, "ASCII");
 436   return __ascii_mbtowc;
 437 }
 438
/* Our own sys_wcstombs/sys_mbstowcs functions differ from the
   wcstombs/mbstowcs API in three ways:

   - The UNICODE private use area is used in filenames to specify
     characters not allowed in Windows filenames ('*', '?', etc).
     The sys_wcstombs function converts characters in the private use
     area back to the corresponding ASCII chars.

   - If a wide character in a filename has no representation in the current
     multibyte charset, then usually you wouldn't be able to access the
     file.  To fix this problem, sys_wcstombs creates a replacement multibyte
     sequence for the non-representable wide char.  The sequence starts with
     an ASCII CAN (0x18, Ctrl-X), followed by the UTF-8 representation of the
     character.  The sys_(cp_)mbstowcs function detects ASCII CAN characters
     in the input multibyte string and converts the following multibyte
     sequence by treating it as a UTF-8 char.  If that fails, the ASCII
     CAN was probably standalone and it just gets copied over as ASCII CAN.

   - The functions always create 0-terminated results, no matter what.
     If the result is truncated due to buffer size, it's a bug in Cygwin
     and the buffer in the calling function should be enlarged. */
 460 size_t __stdcall
 461 sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len,
 462                  const wchar_t *src, size_t nwc)
 463 {
 464   char buf[10];
 465   char *ptr = dst;
 466   wchar_t *pwcs = (wchar_t *) src;
 467   size_t n = 0;
 468   mbstate_t ps;
 469   save_errno save;
 470
 471   memset (&ps, 0, sizeof ps);
 472   if (dst == NULL)
 473     len = (size_t) -1;
 474   while (n < len && nwc-- > 0)
 475     {
 476       wchar_t pw = *pwcs;
 477       int bytes;
 478       unsigned char cwc;
 479
 480       /* Convert UNICODE private use area.  Reverse functionality for the
 481          ASCII area <= 0x7f (only for path names) is transform_chars above.
 482          Reverse functionality for invalid bytes in a multibyte sequence is
 483          in sys_cp_mbstowcs below. */
 484       if ((pw & 0xff00) == 0xf000
 485           && (((cwc = (pw & 0xff)) <= 0x7f && tfx_chars[cwc] >= 0xf000)
 486               || (cwc >= 0x80 && MB_CUR_MAX > 1)))
 487         {
 488           buf[0] = (char) cwc;
 489           bytes = 1;
 490         }
 491       else
 492         {
 493           bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
 494           if (bytes == -1 && *charset != 'U'/*TF-8*/)
 495             {
 496               /* Convert chars invalid in the current codepage to a sequence
 497                  ASCII CAN; UTF-8 representation of invalid char. */
 498               buf[0] = 0x18; /* ASCII CAN */
 499               bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps);
 500               if (bytes == -1)
 501                 {
 502                   ++pwcs;
 503                   ps.__count = 0;
 504                   continue;
 505                 }
 506               ++bytes; /* Add the ASCII CAN to the byte count. */
 507               if (ps.__count == -4 && nwc > 0)
 508                 {
 509                   /* First half of a surrogate pair. */
 510                   ++pwcs;
 511                   if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
 512                     {
 513                       ++pwcs;
 514                       ps.__count = 0;
 515                       continue;
 516                     }
 517                   bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset,
 518                                           &ps);
 519                   nwc--;
 520                 }
 521             }
 522         }
 523       if (n + bytes <= len)
 524         {
 525           n += bytes;
 526           if (dst)
 527             {
 528               for (int i = 0; i < bytes; ++i)
 529                 *ptr++ = buf[i];
 530             }
 531           if (*pwcs++ == 0x00)
 532             break;
 533         }
 534       else
 535         break;
 536     }
 537   if (n && dst)
 538     {
 539       n = (n < len) ? n : len - 1;
 540       dst[n] = '\0';
 541     }
 542
 543   return n;
 544 }
 545
 546 size_t __stdcall
 547 sys_wcstombs (char *dst, size_t len, const wchar_t * src, size_t nwc)
 548 {
 549   return sys_cp_wcstombs (cygheap->locale.wctomb, cygheap->locale.charset,
 550                           dst, len, src, nwc);
 551 }
 552
 553 /* Allocate a buffer big enough for the string, always including the
 554    terminating '\0'.  The buffer pointer is returned in *dst_p, the return
 555    value is the number of bytes written to the buffer, as usual.
 556    The "type" argument determines where the resulting buffer is stored.
 557    It's either one of the cygheap_types values, or it's "HEAP_NOTHEAP".
 558    In the latter case the allocation uses simple calloc.
 559
 560    Note that this code is shared by cygserver (which requires it via
 561    __small_vsprintf) and so when built there plain calloc is the
 562    only choice.  */
 563 size_t __stdcall
 564 sys_wcstombs_alloc (char **dst_p, int type, const wchar_t *src, size_t nwc)
 565 {
 566   size_t ret;
 567
 568   ret = sys_wcstombs (NULL, (size_t) -1, src, nwc);
 569   if (ret > 0)
 570     {
 571       size_t dlen = ret + 1;
 572
 573       if (type == HEAP_NOTHEAP)
 574         *dst_p = (char *) calloc (dlen, sizeof (char));
 575       else
 576         *dst_p = (char *) ccalloc ((cygheap_types) type, dlen, sizeof (char));
 577       if (!*dst_p)
 578         return 0;
 579       ret = sys_wcstombs (*dst_p, dlen, src, nwc);
 580     }
 581   return ret;
 582 }
 583
 584 /* sys_cp_mbstowcs is actually most of the time called as sys_mbstowcs with
 585    a 0 codepage.  If cp is not 0, the codepage is evaluated and used for the
 586    conversion.  This is so that fhandler_console can switch to an alternate
 587    charset, which is the charset returned by GetConsoleCP ().  Most of the
 588    time this is used for box and line drawing characters. */
 589 size_t __stdcall
 590 sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
 591                  size_t dlen, const char *src, size_t nms)
 592 {
 593   wchar_t *ptr = dst;
 594   unsigned const char *pmbs = (unsigned const char *) src;
 595   size_t count = 0;
 596   size_t len = dlen;
 597   int bytes;
 598   mbstate_t ps;
 599   save_errno save;
 600
 601   memset (&ps, 0, sizeof ps);
 602   if (dst == NULL)
 603     len = (size_t)-1;
 604   while (len > 0 && nms > 0)
 605     {
 606       /* ASCII CAN handling. */
 607       if (*pmbs == 0x18)
 608         {
 609           /* Sanity check: If this is a lead CAN byte for a following UTF-8
 610              sequence, there must be at least two more bytes left, and the
 611              next byte must be a valid UTF-8 start byte.  If the charset
 612              isn't UTF-8 anyway, try to convert the following bytes as UTF-8
 613              sequence. */
 614           if (nms > 2 && pmbs[1] >= 0xc2 && pmbs[1] <= 0xf4 && *charset != 'U'/*TF-8*/)
 615             {
 616               bytes = __utf8_mbtowc (_REENT, ptr, (const char *) pmbs + 1,
 617                                      nms - 1, charset, &ps);
 618               if (bytes < 0)
 619                 {
 620                   /* Invalid UTF-8 sequence?  Treat the ASCII CAN character as
 621                      stand-alone ASCII CAN char. */
 622                   bytes = 1;
 623                   if (dst)
 624                     *ptr = 0x18;
 625                   memset (&ps, 0, sizeof ps);
 626                 }
 627               else
 628                 {
 629                   ++bytes; /* Count CAN byte */
 630                   if (bytes > 1 && ps.__count == 4)
 631                     {
 632                       /* First half of a surrogate. */
 633                       wchar_t *ptr2 = dst ? ptr + 1 : NULL;
 634                       int bytes2 = __utf8_mbtowc (_REENT, ptr2,
 635                                                   (const char *) pmbs + bytes,
 636                                                   nms - bytes, charset, &ps);
 637                       if (bytes2 < 0)
 638                         memset (&ps, 0, sizeof ps);
 639                       else
 640                         {
 641                           bytes += bytes2;
 642                           ++count;
 643                           ptr = dst ? ptr + 1 : NULL;
 644                           --len;
 645                         }
 646                     }
 647                 }
 648             }
 649           /* Otherwise it's just a simple ASCII CAN. */
 650           else
 651             {
 652               bytes = 1;
 653               if (dst)
 654                 *ptr = 0x18;
 655             }
 656         }
 657       else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
 658                                   charset, &ps)) < 0)
 659         {
 660           /* The technique is based on a discussion here:
 661              http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
 662
 663              Invalid bytes in a multibyte secuence are converted to
 664              the private use area which is already used to store ASCII
 665              chars invalid in Windows filenames.  This technque allows
 666              to store them in a symmetric way. */
 667           bytes = 1;
 668           if (dst)
 669             *ptr = L'\xf000' | *pmbs;
 670           memset (&ps, 0, sizeof ps);
 671         }
 672
 673       if (bytes > 0)
 674         {
 675           pmbs += bytes;
 676           nms -= bytes;
 677           ++count;
 678           ptr = dst ? ptr + 1 : NULL;
 679           --len;
 680         }
 681       else
 682         {
 683           if (bytes == 0)
 684             ++count;
 685           break;
 686         }
 687     }
 688
 689   if (count && dst)
 690     {
 691       count = (count < dlen) ? count : dlen - 1;
 692       dst[count] = L'\0';
 693     }
 694
 695   return count;
 696 }
 697
 698 size_t __stdcall
 699 sys_mbstowcs (wchar_t * dst, size_t dlen, const char *src, size_t nms)
 700 {
 701   return sys_cp_mbstowcs (cygheap->locale.mbtowc, cygheap->locale.charset,
 702                           dst, dlen, src, nms);
 703 }
 704
 705 /* Same as sys_wcstombs_alloc, just backwards. */
 706 size_t __stdcall
 707 sys_mbstowcs_alloc (wchar_t **dst_p, int type, const char *src, size_t nms)
 708 {
 709   size_t ret;
 710
 711   ret = sys_mbstowcs (NULL, (size_t) -1, src, nms);
 712   if (ret > 0)
 713     {
 714       size_t dlen = ret + 1;
 715
 716       if (type == HEAP_NOTHEAP)
 717         *dst_p = (wchar_t *) calloc (dlen, sizeof (wchar_t));
 718       else
 719         *dst_p = (wchar_t *) ccalloc ((cygheap_types) type, dlen,
 720                                       sizeof (wchar_t));
 721       if (!*dst_p)
 722         return 0;
 723       ret = sys_mbstowcs (*dst_p, dlen, src, nms);
 724     }
 725   return ret;
 726 }
 727
 728 static WCHAR hex_wchars[] = L"0123456789abcdef";
 729
 730 NTSTATUS NTAPI
 731 RtlInt64ToHexUnicodeString (ULONGLONG value, PUNICODE_STRING dest,
 732                             BOOLEAN append)
 733 {
 734   USHORT len = append ? dest->Length : 0;
 735   if (dest->MaximumLength - len < 16 * (int) sizeof (WCHAR))
 736     return STATUS_BUFFER_OVERFLOW;
 737   wchar_t *end = (PWCHAR) ((PBYTE) dest->Buffer + len);
 738   register PWCHAR p = end + 16;
 739   while (p-- > end)
 740     {
 741       *p = hex_wchars[value & 0xf];
 742       value >>= 4;
 743     }
 744   dest->Length += 16 * sizeof (WCHAR);
 745   return STATUS_SUCCESS;
 746 }