winsup/cygwin/strfuncs.cc

   1 /* strfuncs.cc: misc funcs that don't belong anywhere else
   2
   3    Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4    2005, 2006, 2007, 2008, 2009, 2010, 2011 Red Hat, Inc.
   5
   6 This file is part of Cygwin.
   7
   8 This software is a copyrighted work licensed under the terms of the
   9 Cygwin license.  Please consult the file "CYGWIN_LICENSE" for
  10 details. */
  11
  12 #include "winsup.h"
  13 #include <stdlib.h>
  14 #include <wchar.h>
  15 #include <winnls.h>
  16 #include <ntdll.h>
  17 #include "path.h"
  18 #include "fhandler.h"
  19 #include "dtable.h"
  20 #include "cygheap.h"
  21
  22 /* Transform characters invalid for Windows filenames to the Unicode private
  23    use area in the U+f0XX range.  The affected characters are all control
  24    chars 1 <= c <= 31, as well as the characters " * : < > ? |.  The backslash
  25    is affected as well, but we can't transform it as long as we accept Win32
  26    paths as input. */
  27 static const WCHAR tfx_chars[] = {
  28             0, 0xf000 |   1, 0xf000 |   2, 0xf000 |   3,
  29  0xf000 |   4, 0xf000 |   5, 0xf000 |   6, 0xf000 |   7,
  30  0xf000 |   8, 0xf000 |   9, 0xf000 |  10, 0xf000 |  11,
  31  0xf000 |  12, 0xf000 |  13, 0xf000 |  14, 0xf000 |  15,
  32  0xf000 |  16, 0xf000 |  17, 0xf000 |  18, 0xf000 |  19,
  33  0xf000 |  20, 0xf000 |  21, 0xf000 |  22, 0xf000 |  23,
  34  0xf000 |  24, 0xf000 |  25, 0xf000 |  26, 0xf000 |  27,
  35  0xf000 |  28, 0xf000 |  29, 0xf000 |  30, 0xf000 |  31,
  36           ' ',          '!', 0xf000 | '"',          '#',
  37           '$',          '%',          '&',           39,
  38           '(',          ')', 0xf000 | '*',          '+',
  39           ',',          '-',          '.',          '\\',
  40           '0',          '1',          '2',          '3',
  41           '4',          '5',          '6',          '7',
  42           '8',          '9', 0xf000 | ':',          ';',
  43  0xf000 | '<',          '=', 0xf000 | '>', 0xf000 | '?',
  44           '@',          'A',          'B',          'C',
  45           'D',          'E',          'F',          'G',
  46           'H',          'I',          'J',          'K',
  47           'L',          'M',          'N',          'O',
  48           'P',          'Q',          'R',          'S',
  49           'T',          'U',          'V',          'W',
  50           'X',          'Y',          'Z',          '[',
  51           '\\',          ']',          '^',          '_',
  52           '`',          'a',          'b',          'c',
  53           'd',          'e',          'f',          'g',
  54           'h',          'i',          'j',          'k',
  55           'l',          'm',          'n',          'o',
  56           'p',          'q',          'r',          's',
  57           't',          'u',          'v',          'w',
  58           'x',          'y',          'z',          '{',
  59  0xf000 | '|',          '}',          '~',          127
  60 };
  61
  62 /* This is the table for the reverse functionality in sys_cp_wcstombs.
  63    It differs deliberately in two code places (space and dot) to allow
  64    converting back space and dot on filesystems only supporting DOS
  65    filenames. */
  66 static const WCHAR tfx_rev_chars[] = {
  67             0, 0xf000 |   1, 0xf000 |   2, 0xf000 |   3,
  68  0xf000 |   4, 0xf000 |   5, 0xf000 |   6, 0xf000 |   7,
  69  0xf000 |   8, 0xf000 |   9, 0xf000 |  10, 0xf000 |  11,
  70  0xf000 |  12, 0xf000 |  13, 0xf000 |  14, 0xf000 |  15,
  71  0xf000 |  16, 0xf000 |  17, 0xf000 |  18, 0xf000 |  19,
  72  0xf000 |  20, 0xf000 |  21, 0xf000 |  22, 0xf000 |  23,
  73  0xf000 |  24, 0xf000 |  25, 0xf000 |  26, 0xf000 |  27,
  74  0xf000 |  28, 0xf000 |  29, 0xf000 |  30, 0xf000 |  31,
  75  0xf000 | ' ',          '!', 0xf000 | '"',          '#',
  76           '$',          '%',          '&',           39,
  77           '(',          ')', 0xf000 | '*',          '+',
  78           ',',          '-', 0xf000 | '.',          '\\',
  79           '0',          '1',          '2',          '3',
  80           '4',          '5',          '6',          '7',
  81           '8',          '9', 0xf000 | ':',          ';',
  82  0xf000 | '<',          '=', 0xf000 | '>', 0xf000 | '?',
  83           '@',          'A',          'B',          'C',
  84           'D',          'E',          'F',          'G',
  85           'H',          'I',          'J',          'K',
  86           'L',          'M',          'N',          'O',
  87           'P',          'Q',          'R',          'S',
  88           'T',          'U',          'V',          'W',
  89           'X',          'Y',          'Z',          '[',
  90           '\\',          ']',          '^',          '_',
  91           '`',          'a',          'b',          'c',
  92           'd',          'e',          'f',          'g',
  93           'h',          'i',          'j',          'k',
  94           'l',          'm',          'n',          'o',
  95           'p',          'q',          'r',          's',
  96           't',          'u',          'v',          'w',
  97           'x',          'y',          'z',          '{',
  98  0xf000 | '|',          '}',          '~',          127
  99 };
 100
 101 void
 102 transform_chars (PWCHAR path, PWCHAR path_end)
 103 {
 104   for (; path <= path_end; ++path)
 105     if (*path < 128)
 106       *path = tfx_chars[*path];
 107 }
 108
 109 /* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
 110    wchar_t character representation.  That's unfortunate for us since
 111    we require UTF for the OS.  What we do here is to have our own
 112    implementation of the base functions for the conversion using
  113    the MultiByteToWideChar/WideCharToMultiByte functions. */
 114
 115 /* FIXME: We can't support JIS (ISO-2022-JP) at all right now.  It's a
 116    stateful charset encoding.  The translation from mbtowc to
  117    MultiByteToWideChar is quite complex.  Given that we support SJIS and
  118    eucJP, the two most widely used Japanese charset encodings, this
  119    shouldn't be such a big problem. */
 120
 121 /* GBK, eucKR, and Big5 conversions are not available so far in newlib. */
 122
 123 static int
 124 __db_wctomb (struct _reent *r, char *s, wchar_t wchar, UINT cp)
 125 {
 126   if (s == NULL)
 127     return 0;
 128
 129   if (wchar < 0x80)
 130     {
 131       *s = (char) wchar;
 132       return 1;
 133     }
 134
 135   BOOL def_used = false;
 136   int ret = WideCharToMultiByte (cp, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
 137                                  2, NULL, &def_used);
 138   if (ret > 0 && !def_used)
 139     return ret;
 140
 141   r->_errno = EILSEQ;
 142   return -1;
 143 }
 144
 145 extern "C" int
 146 __sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 147                mbstate_t *state)
 148 {
 149   return __db_wctomb (r,s, wchar, 932);
 150 }
 151
 152 extern "C" int
 153 __eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 154                mbstate_t *state)
 155 {
 156   /* Unfortunately, the Windows eucJP codepage 20932 is not really 100%
 157      compatible to eucJP.  It's a cute approximation which makes it a
 158      doublebyte codepage.
 159      The JIS-X-0212 three byte codes (0x8f,0xa1-0xfe,0xa1-0xfe) are folded
 160      into two byte codes as follows: The 0x8f is stripped, the next byte is
 161      taken as is, the third byte is mapped into the lower 7-bit area by
 162      masking it with 0x7f.  So, for instance, the eucJP code 0x8f,0xdd,0xf8
 163      becomes 0xdd,0x78 in CP 20932.
 164
 165      To be really eucJP compatible, we have to map the JIS-X-0212 characters
 166      between CP 20932 and eucJP ourselves. */
 167   if (s == NULL)
 168     return 0;
 169
 170   if (wchar < 0x80)
 171     {
 172       *s = (char) wchar;
 173       return 1;
 174     }
 175
 176   BOOL def_used = false;
 177   int ret = WideCharToMultiByte (20932, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
 178                                  3, NULL, &def_used);
 179   if (ret > 0 && !def_used)
 180     {
 181       /* CP20932 representation of JIS-X-0212 character? */
 182       if (ret == 2 && (unsigned char) s[1] <= 0x7f)
 183         {
 184           /* Yes, convert to eucJP three byte sequence */
 185           s[2] = s[1] | 0x80;
 186           s[1] = s[0];
 187           s[0] = 0x8f;
 188           ++ret;
 189         }
 190       return ret;
 191     }
 192
 193   r->_errno = EILSEQ;
 194   return -1;
 195 }
 196
 197 extern "C" int
 198 __gbk_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 199                mbstate_t *state)
 200 {
 201   return __db_wctomb (r,s, wchar, 936);
 202 }
 203
 204 extern "C" int
 205 __kr_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 206                mbstate_t *state)
 207 {
 208   return __db_wctomb (r,s, wchar, 949);
 209 }
 210
 211 extern "C" int
 212 __big5_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
 213                mbstate_t *state)
 214 {
 215   return __db_wctomb (r,s, wchar, 950);
 216 }
 217
 218 static int
 219 __db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, UINT cp,
 220              mbstate_t *state)
 221 {
 222   wchar_t dummy;
 223   int ret;
 224
 225   if (s == NULL)
 226     return 0;  /* not state-dependent */
 227
 228   if (n == 0)
 229     return -2;
 230
 231   if (pwc == NULL)
 232     pwc = &dummy;
 233
 234   if (state->__count == 0)
 235     {
 236       if (*(unsigned char *) s < 0x80)
 237         {
 238           *pwc = *(unsigned char *) s;
 239           return *s ? 1 : 0;
 240         }
 241       size_t cnt = min (n, 2);
 242       ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1);
 243       if (ret)
 244         return cnt;
 245       if (n == 1)
 246         {
 247           state->__count = n;
 248           state->__value.__wchb[0] = *s;
 249           return -2;
 250         }
 251       /* These Win32 functions are really crappy.  Assuming n is 2 but the
 252          first byte is a singlebyte charcode, the function does not convert
 253          that byte and return 1, rather it just returns 0.  So, what we do
 254          here is to check if the first byte returns a valid value... */
 255       else if (MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
 256         return 1;
 257       r->_errno = EILSEQ;
 258       return -1;
 259     }
 260   state->__value.__wchb[state->__count] = *s;
 261   ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS,
 262                              (const char *) state->__value.__wchb, 2, pwc, 1);
 263   if (!ret)
 264     {
 265       r->_errno = EILSEQ;
 266       return -1;
 267     }
 268   state->__count = 0;
 269   return 1;
 270 }
 271
 272 extern "C" int
 273 __sjis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 274                const char *charset, mbstate_t *state)
 275 {
 276   return __db_mbtowc (r, pwc, s, n, 932, state);
 277 }
 278
 279 extern "C" int
 280 __eucjp_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 281                 const char *charset, mbstate_t *state)
 282 {
 283   /* See comment in __eucjp_wctomb above. */
 284   wchar_t dummy;
 285   int ret = 0;
 286
 287   if (s == NULL)
 288     return 0;  /* not state-dependent */
 289
 290   if (n == 0)
 291     return -2;
 292
 293   if (pwc == NULL)
 294     pwc = &dummy;
 295
 296   if (state->__count == 0)
 297     {
 298       if (*(unsigned char *) s < 0x80)
 299         {
 300           *pwc = *(unsigned char *) s;
 301           return *s ? 1 : 0;
 302         }
 303       if (*(unsigned char *) s == 0x8f) /* JIS-X-0212 lead byte? */
 304         {
 305           /* Yes.  Store sequence in mbstate and handle in the __count != 0
 306              case at the end of the function. */
 307           size_t i;
 308           for (i = 0; i < 3 && i < n; i++)
 309             state->__value.__wchb[i] = s[i];
 310           if ((state->__count = i) < 3) /* Incomplete sequence? */
 311             return -2;
 312           ret = 3;
 313           goto jis_x_0212;
 314         }
 315       size_t cnt = min (n, 2);
 316       if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1))
 317         return cnt;
 318       if (n == 1)
 319         {
 320           state->__count = 1;
 321           state->__value.__wchb[0] = *s;
 322           return -2;
 323         }
 324       else if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
 325         return 1;
 326       r->_errno = EILSEQ;
 327       return -1;
 328     }
 329   state->__value.__wchb[state->__count++] = *s;
 330   ret = 1;
 331 jis_x_0212:
 332   if (state->__value.__wchb[0] == 0x8f)
 333     {
 334       if (state->__count == 2)
 335         {
 336           if (n == 1)
 337             return -2;
 338           state->__value.__wchb[state->__count] = s[1];
 339           ret = 2;
 340         }
 341       /* Ok, we have a full JIS-X-0212 sequence in mbstate.  Convert it
 342          to the CP 20932 representation and feed it to MultiByteToWideChar. */
 343       state->__value.__wchb[0] = state->__value.__wchb[1];
 344       state->__value.__wchb[1] = state->__value.__wchb[2] & 0x7f;
 345     }
 346   if (!MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS,
 347                             (const char *) state->__value.__wchb, 2, pwc, 1))
 348     {
 349       r->_errno = EILSEQ;
 350       return -1;
 351     }
 352   state->__count = 0;
 353   return ret;
 354 }
 355
 356 extern "C" int
 357 __gbk_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 358                const char *charset, mbstate_t *state)
 359 {
 360   return __db_mbtowc (r, pwc, s, n, 936, state);
 361 }
 362
 363 extern "C" int
 364 __kr_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 365                const char *charset, mbstate_t *state)
 366 {
 367   return __db_mbtowc (r, pwc, s, n, 949, state);
 368 }
 369
 370 extern "C" int
 371 __big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
 372                const char *charset, mbstate_t *state)
 373 {
 374   return __db_mbtowc (r, pwc, s, n, 950, state);
 375 }
 376
 377 /* Our own sys_wcstombs/sys_mbstowcs functions differ from the
 378    wcstombs/mbstowcs API in three ways:
 379
 380    - The UNICODE private use area is used in filenames to specify
 381      characters not allowed in Windows filenames ('*', '?', etc).
 382      The sys_wcstombs converts characters in the private use area
 383      back to the corresponding ASCII chars.
 384
 385    - If a wide character in a filename has no representation in the current
 386      multibyte charset, then usually you wouldn't be able to access the
 387      file.  To fix this problem, sys_wcstombs creates a replacement multibyte
 388      sequences for the non-representable wide-char.  The sequence starts with
 389      an ASCII CAN (0x18, Ctrl-X), followed by the UTF-8 representation of the
 390      character.  The sys_(cp_)mbstowcs function detects ASCII CAN characters
 391      in the input multibyte string and converts the following multibyte
 392      sequence in by treating it as an UTF-8 char.  If that fails, the ASCII
 393      CAN was probably standalone and it gets just copied over as ASCII CAN.
 394
 395    - The functions always create 0-terminated results, no matter what.
 396      If the result is truncated due to buffer size, it's a bug in Cygwin
 397      and the buffer in the calling function should be raised. */
 398 size_t __stdcall
 399 sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len,
 400                  const wchar_t *src, size_t nwc)
 401 {
 402   char buf[10];
 403   char *ptr = dst;
 404   wchar_t *pwcs = (wchar_t *) src;
 405   size_t n = 0;
 406   mbstate_t ps;
 407   save_errno save;
 408
 409   memset (&ps, 0, sizeof ps);
 410   if (dst == NULL)
 411     len = (size_t) -1;
 412   while (n < len && nwc-- > 0)
 413     {
 414       wchar_t pw = *pwcs;
 415       int bytes;
 416       unsigned char cwc;
 417
 418       /* Convert UNICODE private use area.  Reverse functionality for the
 419          ASCII area <= 0x7f (only for path names) is transform_chars above.
 420          Reverse functionality for invalid bytes in a multibyte sequence is
 421          in sys_cp_mbstowcs below. */
 422       if ((pw & 0xff00) == 0xf000
 423           && (((cwc = (pw & 0xff)) <= 0x7f && tfx_rev_chars[cwc] >= 0xf000)
 424               || (cwc >= 0x80 && MB_CUR_MAX > 1)))
 425         {
 426           buf[0] = (char) cwc;
 427           bytes = 1;
 428         }
 429       else
 430         {
 431           bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
 432           if (bytes == -1 && *charset != 'U'/*TF-8*/)
 433             {
 434               /* Convert chars invalid in the current codepage to a sequence
 435                  ASCII CAN; UTF-8 representation of invalid char. */
 436               buf[0] = 0x18; /* ASCII CAN */
 437               bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps);
 438               if (bytes == -1)
 439                 {
 440                   ++pwcs;
 441                   ps.__count = 0;
 442                   continue;
 443                 }
 444               ++bytes; /* Add the ASCII CAN to the byte count. */
 445               if (ps.__count == -4 && nwc > 0)
 446                 {
 447                   /* First half of a surrogate pair. */
 448                   ++pwcs;
 449                   if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
 450                     {
 451                       ++pwcs;
 452                       ps.__count = 0;
 453                       continue;
 454                     }
 455                   bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset,
 456                                           &ps);
 457                   nwc--;
 458                 }
 459             }
 460         }
 461       if (n + bytes <= len)
 462         {
 463           n += bytes;
 464           if (dst)
 465             {
 466               for (int i = 0; i < bytes; ++i)
 467                 *ptr++ = buf[i];
 468             }
 469           if (*pwcs++ == 0x00)
 470             break;
 471         }
 472       else
 473         break;
 474     }
 475   if (n && dst)
 476     {
 477       n = (n < len) ? n : len - 1;
 478       dst[n] = '\0';
 479     }
 480
 481   return n;
 482 }
 483
 484 size_t __stdcall
 485 sys_wcstombs (char *dst, size_t len, const wchar_t * src, size_t nwc)
 486 {
 487   return sys_cp_wcstombs (cygheap->locale.wctomb, cygheap->locale.charset,
 488                           dst, len, src, nwc);
 489 }
 490
 491 /* Allocate a buffer big enough for the string, always including the
 492    terminating '\0'.  The buffer pointer is returned in *dst_p, the return
 493    value is the number of bytes written to the buffer, as usual.
 494    The "type" argument determines where the resulting buffer is stored.
 495    It's either one of the cygheap_types values, or it's "HEAP_NOTHEAP".
 496    In the latter case the allocation uses simple calloc.
 497
 498    Note that this code is shared by cygserver (which requires it via
 499    __small_vsprintf) and so when built there plain calloc is the
 500    only choice.  */
 501 size_t __stdcall
 502 sys_wcstombs_alloc (char **dst_p, int type, const wchar_t *src, size_t nwc)
 503 {
 504   size_t ret;
 505
 506   ret = sys_wcstombs (NULL, (size_t) -1, src, nwc);
 507   if (ret > 0)
 508     {
 509       size_t dlen = ret + 1;
 510
 511       if (type == HEAP_NOTHEAP)
 512         *dst_p = (char *) calloc (dlen, sizeof (char));
 513       else
 514         *dst_p = (char *) ccalloc ((cygheap_types) type, dlen, sizeof (char));
 515       if (!*dst_p)
 516         return 0;
 517       ret = sys_wcstombs (*dst_p, dlen, src, nwc);
 518     }
 519   return ret;
 520 }
 521
 522 /* sys_cp_mbstowcs is actually most of the time called as sys_mbstowcs with
 523    a 0 codepage.  If cp is not 0, the codepage is evaluated and used for the
 524    conversion.  This is so that fhandler_console can switch to an alternate
 525    charset, which is the charset returned by GetConsoleCP ().  Most of the
 526    time this is used for box and line drawing characters. */
 527 size_t __stdcall
 528 sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
 529                  size_t dlen, const char *src, size_t nms)
 530 {
 531   wchar_t *ptr = dst;
 532   unsigned const char *pmbs = (unsigned const char *) src;
 533   size_t count = 0;
 534   size_t len = dlen;
 535   int bytes;
 536   mbstate_t ps;
 537   save_errno save;
 538
 539   memset (&ps, 0, sizeof ps);
 540   if (dst == NULL)
 541     len = (size_t)-1;
 542   while (len > 0 && nms > 0)
 543     {
 544       /* ASCII CAN handling. */
 545       if (*pmbs == 0x18)
 546         {
 547           /* Sanity check: If this is a lead CAN byte for a following UTF-8
 548              sequence, there must be at least two more bytes left, and the
 549              next byte must be a valid UTF-8 start byte.  If the charset
 550              isn't UTF-8 anyway, try to convert the following bytes as UTF-8
 551              sequence. */
 552           if (nms > 2 && pmbs[1] >= 0xc2 && pmbs[1] <= 0xf4 && *charset != 'U'/*TF-8*/)
 553             {
 554               bytes = __utf8_mbtowc (_REENT, ptr, (const char *) pmbs + 1,
 555                                      nms - 1, charset, &ps);
 556               if (bytes < 0)
 557                 {
 558                   /* Invalid UTF-8 sequence?  Treat the ASCII CAN character as
 559                      stand-alone ASCII CAN char. */
 560                   bytes = 1;
 561                   if (dst)
 562                     *ptr = 0x18;
 563                   memset (&ps, 0, sizeof ps);
 564                 }
 565               else
 566                 {
 567                   ++bytes; /* Count CAN byte */
 568                   if (bytes > 1 && ps.__count == 4)
 569                     {
 570                       /* First half of a surrogate. */
 571                       wchar_t *ptr2 = dst ? ptr + 1 : NULL;
 572                       int bytes2 = __utf8_mbtowc (_REENT, ptr2,
 573                                                   (const char *) pmbs + bytes,
 574                                                   nms - bytes, charset, &ps);
 575                       if (bytes2 < 0)
 576                         memset (&ps, 0, sizeof ps);
 577                       else
 578                         {
 579                           bytes += bytes2;
 580                           ++count;
 581                           ptr = dst ? ptr + 1 : NULL;
 582                           --len;
 583                         }
 584                     }
 585                 }
 586             }
 587           /* Otherwise it's just a simple ASCII CAN. */
 588           else
 589             {
 590               bytes = 1;
 591               if (dst)
 592                 *ptr = 0x18;
 593             }
 594         }
 595       else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
 596                                   charset, &ps)) < 0)
 597         {
 598           /* The technique is based on a discussion here:
 599              http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
 600
 601              Invalid bytes in a multibyte secuence are converted to
 602              the private use area which is already used to store ASCII
 603              chars invalid in Windows filenames.  This technque allows
 604              to store them in a symmetric way. */
 605           bytes = 1;
 606           if (dst)
 607             *ptr = L'\xf000' | *pmbs;
 608           memset (&ps, 0, sizeof ps);
 609         }
 610
 611       if (bytes > 0)
 612         {
 613           pmbs += bytes;
 614           nms -= bytes;
 615           ++count;
 616           ptr = dst ? ptr + 1 : NULL;
 617           --len;
 618         }
 619       else
 620         {
 621           if (bytes == 0)
 622             ++count;
 623           break;
 624         }
 625     }
 626
 627   if (count && dst)
 628     {
 629       count = (count < dlen) ? count : dlen - 1;
 630       dst[count] = L'\0';
 631     }
 632
 633   return count;
 634 }
 635
 636 size_t __stdcall
 637 sys_mbstowcs (wchar_t * dst, size_t dlen, const char *src, size_t nms)
 638 {
 639   return sys_cp_mbstowcs (cygheap->locale.mbtowc, cygheap->locale.charset,
 640                           dst, dlen, src, nms);
 641 }
 642
 643 /* Same as sys_wcstombs_alloc, just backwards. */
 644 size_t __stdcall
 645 sys_mbstowcs_alloc (wchar_t **dst_p, int type, const char *src, size_t nms)
 646 {
 647   size_t ret;
 648
 649   ret = sys_mbstowcs (NULL, (size_t) -1, src, nms);
 650   if (ret > 0)
 651     {
 652       size_t dlen = ret + 1;
 653
 654       if (type == HEAP_NOTHEAP)
 655         *dst_p = (wchar_t *) calloc (dlen, sizeof (wchar_t));
 656       else
 657         *dst_p = (wchar_t *) ccalloc ((cygheap_types) type, dlen,
 658                                       sizeof (wchar_t));
 659       if (!*dst_p)
 660         return 0;
 661       ret = sys_mbstowcs (*dst_p, dlen, src, nms);
 662     }
 663   return ret;
 664 }
 665
 666 /* Copy string, until c or <nul> is encountered.
 667    NUL-terminate the destination string (s1).
 668    Return pointer to terminating byte in dst string.  */
 669 char * __stdcall
 670 strccpy (char *s1, const char **s2, char c)
 671 {
 672   while (**s2 && **s2 != c)
 673     *s1++ = *((*s2)++);
 674   *s1 = 0;
 675
 676   MALLOC_CHECK;
 677   return s1;
 678 }
 679
 680 static WCHAR hex_wchars[] = L"0123456789abcdef";
 681
 682 NTSTATUS NTAPI
 683 RtlInt64ToHexUnicodeString (ULONGLONG value, PUNICODE_STRING dest,
 684                             BOOLEAN append)
 685 {
 686   USHORT len = append ? dest->Length : 0;
 687   if (dest->MaximumLength - len < 16 * (int) sizeof (WCHAR))
 688     return STATUS_BUFFER_OVERFLOW;
 689   wchar_t *end = (PWCHAR) ((PBYTE) dest->Buffer + len);
 690   register PWCHAR p = end + 16;
 691   while (p-- > end)
 692     {
 693       *p = hex_wchars[value & 0xf];
 694       value >>= 4;
 695     }
 696   dest->Length += 16 * sizeof (WCHAR);
 697   return STATUS_SUCCESS;
 698 }