1 /* strfuncs.cc: misc funcs that don't belong anywhere else
3 Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009 Red Hat, Inc.
6 This file is part of Cygwin.
8 This software is a copyrighted work licensed under the terms of the
9 Cygwin license. Please consult the file "CYGWIN_LICENSE" for
25 /* Transform characters invalid for Windows filenames to the Unicode private
26 use area in the U+f0XX range. The affected characters are all control
27 chars 1 <= c <= 31, as well as the characters " * : < > ? |. The backslash
28 is affected as well, but we can't transform it as long as we accept Win32
30 The reverse functionality is in function sys_cp_wcstombs. */
31 static const WCHAR tfx_chars[] = {
32 0, 0xf000 | 1, 0xf000 | 2, 0xf000 | 3,
33 0xf000 | 4, 0xf000 | 5, 0xf000 | 6, 0xf000 | 7,
34 0xf000 | 8, 0xf000 | 9, 0xf000 | 10, 0xf000 | 11,
35 0xf000 | 12, 0xf000 | 13, 0xf000 | 14, 0xf000 | 15,
36 0xf000 | 16, 0xf000 | 17, 0xf000 | 18, 0xf000 | 19,
37 0xf000 | 20, 0xf000 | 21, 0xf000 | 22, 0xf000 | 23,
38 0xf000 | 24, 0xf000 | 25, 0xf000 | 26, 0xf000 | 27,
39 0xf000 | 28, 0xf000 | 29, 0xf000 | 30, 0xf000 | 31,
40 ' ', '!', 0xf000 | '"', '#',
42 '(', ')', 0xf000 | '*', '+',
46 '8', '9', 0xf000 | ':', ';',
47 0xf000 | '<', '=', 0xf000 | '>', 0xf000 | '?',
63 0xf000 | '|', '}', '~', 127
67 transform_chars (PWCHAR path, PWCHAR path_end)
69 for (; path <= path_end; ++path)
71 *path = tfx_chars[*path];
74 /* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
75 wchar_t character representation. That's unfortunate for us since
76 we require UTF for the OS. What we do here is to have our own
77 implementation of the base functions for the conversion using
78 the MultiByteToWideChar/WideCharToMultiByte functions. */
80 /* FIXME: We can't support JIS (ISO-2022-JP) at all right now. It's a
81 stateful charset encoding. The translation from mbtowc to
82 MultiByteToWideChar is quite complex. Given that we support SJIS and
83 eucJP, the two most widely used Japanese charset encodings, this shouldn't
84 be such a big problem. */
86 /* GBK, eucKR, and Big5 conversions are not available so far in newlib. */
89 __db_wctomb (struct _reent *r, char *s, wchar_t wchar, UINT cp)
100 BOOL def_used = false;
101 int ret = WideCharToMultiByte (cp, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
103 if (ret > 0 && !def_used)
111 __sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
114 return __db_wctomb (r,s, wchar, 932);
118 __jis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
121 /* FIXME: See comment at start of file. */
122 return __ascii_wctomb (r, s, wchar, charset, state);
126 __eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
129 /* Unfortunately, the Windows eucJP codepage 20932 is not really 100%
130 compatible with eucJP. It's a cute approximation which makes it a
132 The JIS-X-0212 three byte codes (0x8f,0xa1-0xfe,0xa1-0xfe) are folded
133 into two byte codes as follows: The 0x8f is stripped, the next byte is
134 taken as is, the third byte is mapped into the lower 7-bit area by
135 masking it with 0x7f. So, for instance, the eucJP code 0x8f,0xdd,0xf8
136 becomes 0xdd,0x78 in CP 20932.
138 To be really eucJP compatible, we have to map the JIS-X-0212 characters
139 between CP 20932 and eucJP ourselves. */
149 BOOL def_used = false;
150 int ret = WideCharToMultiByte (20932, WC_NO_BEST_FIT_CHARS, &wchar, 1, s,
152 if (ret > 0 && !def_used)
154 /* CP20932 representation of JIS-X-0212 character? */
155 if (ret == 2 && (unsigned char) s[1] <= 0x7f)
157 /* Yes, convert to eucJP three byte sequence */
171 __gbk_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
174 return __db_wctomb (r,s, wchar, 936);
178 __kr_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
181 return __db_wctomb (r,s, wchar, 949);
185 __big5_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset,
188 return __db_wctomb (r,s, wchar, 950);
192 __db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, UINT cp,
199 return 0; /* not state-dependent */
207 if (state->__count == 0)
209 if (*(unsigned char *) s < 0x80)
211 *pwc = *(unsigned char *) s;
214 size_t cnt = min (n, 2);
215 ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1);
221 state->__value.__wchb[0] = *s;
224 /* These Win32 functions are really crappy. Assuming n is 2 but the
225 first byte is a singlebyte charcode, the function does not convert
226 that byte and return 1, rather it just returns 0. So, what we do
227 here is to check if the first byte returns a valid value... */
228 else if (MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
233 state->__value.__wchb[state->__count] = *s;
234 ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS,
235 (const char *) state->__value.__wchb, 2, pwc, 1);
246 __sjis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
247 const char *charset, mbstate_t *state)
249 return __db_mbtowc (r, pwc, s, n, 932, state);
253 __jis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
254 const char *charset, mbstate_t *state)
256 /* FIXME: See comment at start of file. */
257 return __ascii_mbtowc (r, pwc, s, n, charset, state);
261 __eucjp_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
262 const char *charset, mbstate_t *state)
264 /* See comment in __eucjp_wctomb above. */
269 return 0; /* not state-dependent */
277 if (state->__count == 0)
279 if (*(unsigned char *) s < 0x80)
281 *pwc = *(unsigned char *) s;
284 if (*(unsigned char *) s == 0x8f) /* JIS-X-0212 lead byte? */
286 /* Yes. Store sequence in mbstate and handle in the __count != 0
287 case at the end of the function. */
289 for (i = 0; i < 3 && i < n; i++)
290 state->__value.__wchb[i] = s[i];
291 if ((state->__count = i) < 3) /* Incomplete sequence? */
296 size_t cnt = min (n, 2);
297 if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1))
302 state->__value.__wchb[0] = *s;
305 else if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, 1, pwc, 1))
310 state->__value.__wchb[state->__count++] = *s;
313 if (state->__value.__wchb[0] == 0x8f)
315 if (state->__count == 2)
319 state->__value.__wchb[state->__count] = s[1];
322 /* Ok, we have a full JIS-X-0212 sequence in mbstate. Convert it
323 to the CP 20932 representation and feed it to MultiByteToWideChar. */
324 state->__value.__wchb[0] = state->__value.__wchb[1];
325 state->__value.__wchb[1] = state->__value.__wchb[2] & 0x7f;
327 if (!MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS,
328 (const char *) state->__value.__wchb, 2, pwc, 1))
338 __gbk_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
339 const char *charset, mbstate_t *state)
341 return __db_mbtowc (r, pwc, s, n, 936, state);
345 __kr_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
346 const char *charset, mbstate_t *state)
348 return __db_mbtowc (r, pwc, s, n, 949, state);
352 __big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
353 const char *charset, mbstate_t *state)
355 return __db_mbtowc (r, pwc, s, n, 950, state);
358 /* Convert Windows codepage to a setlocale compatible character set code.
359 Called from newlib's setlocale() with codepage set to 0, if the
360 charset isn't given explicitly in the POSIX compatible locale specifier.
361 The function also returns a pointer to the corresponding _mbtowc_r
364 __set_charset_from_codepage (UINT cp, char *charset)
394 __small_sprintf (charset, "CP%u", cp);
407 __small_sprintf (charset, "ISO-8859-%u", cp - 28590);
410 strcpy (charset, "SJIS");
411 return __sjis_mbtowc;
413 strcpy (charset, "GBK");
417 strcpy (charset, "EUCKR");
420 strcpy (charset, "BIG5");
421 return __big5_mbtowc;
423 strcpy (charset, "JIS");
427 strcpy (charset, "EUCJP");
428 return __eucjp_mbtowc;
430 strcpy (charset, "UTF-8");
431 return __utf8_mbtowc;
435 strcpy (charset, "ASCII");
436 return __ascii_mbtowc;
439 /* Our own sys_wcstombs/sys_mbstowcs functions differ from the
440 wcstombs/mbstowcs API in three ways:
442 - The UNICODE private use area is used in filenames to specify
443 characters not allowed in Windows filenames ('*', '?', etc).
444 The sys_wcstombs converts characters in the private use area
445 back to the corresponding ASCII chars.
447 - If a wide character in a filename has no representation in the current
448 multibyte charset, then usually you wouldn't be able to access the
449 file. To fix this problem, sys_wcstombs creates a replacement multibyte
450 sequence for the non-representable wide-char. The sequence starts with
451 an ASCII CAN (0x18, Ctrl-X), followed by the UTF-8 representation of the
452 character. The sys_(cp_)mbstowcs function detects ASCII CAN characters
453 in the input multibyte string and converts the following multibyte
454 sequence by treating it as a UTF-8 char. If that fails, the ASCII
455 CAN was probably standalone and it gets just copied over as ASCII CAN.
457 - The functions always create 0-terminated results, no matter what.
458 If the result is truncated due to buffer size, it's a bug in Cygwin
459 and the buffer in the calling function should be enlarged. */
461 sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len,
462 const wchar_t *src, size_t nwc)
466 wchar_t *pwcs = (wchar_t *) src;
471 memset (&ps, 0, sizeof ps);
474 while (n < len && nwc-- > 0)
480 /* Convert UNICODE private use area. Reverse functionality for the
481 ASCII area <= 0x7f (only for path names) is transform_chars above.
482 Reverse functionality for invalid bytes in a multibyte sequence is
483 in sys_cp_mbstowcs below. */
484 if ((pw & 0xff00) == 0xf000
485 && (((cwc = (pw & 0xff)) <= 0x7f && tfx_chars[cwc] >= 0xf000)
486 || (cwc >= 0x80 && MB_CUR_MAX > 1)))
493 bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
494 if (bytes == -1 && *charset != 'U'/*TF-8*/)
496 /* Convert chars invalid in the current codepage to a sequence
497 ASCII CAN; UTF-8 representation of invalid char. */
498 buf[0] = 0x18; /* ASCII CAN */
499 bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps);
506 ++bytes; /* Add the ASCII CAN to the byte count. */
507 if (ps.__count == -4 && nwc > 0)
509 /* First half of a surrogate pair. */
511 if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
517 bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset,
523 if (n + bytes <= len)
528 for (int i = 0; i < bytes; ++i)
539 n = (n < len) ? n : len - 1;
547 sys_wcstombs (char *dst, size_t len, const wchar_t * src, size_t nwc)
549 return sys_cp_wcstombs (cygheap->locale.wctomb, cygheap->locale.charset,
553 /* Allocate a buffer big enough for the string, always including the
554 terminating '\0'. The buffer pointer is returned in *dst_p, the return
555 value is the number of bytes written to the buffer, as usual.
556 The "type" argument determines where the resulting buffer is stored.
557 It's either one of the cygheap_types values, or it's "HEAP_NOTHEAP".
558 In the latter case the allocation uses simple calloc.
560 Note that this code is shared by cygserver (which requires it via
561 __small_vsprintf) and so when built there plain calloc is the
564 sys_wcstombs_alloc (char **dst_p, int type, const wchar_t *src, size_t nwc)
568 ret = sys_wcstombs (NULL, (size_t) -1, src, nwc);
571 size_t dlen = ret + 1;
573 if (type == HEAP_NOTHEAP)
574 *dst_p = (char *) calloc (dlen, sizeof (char));
576 *dst_p = (char *) ccalloc ((cygheap_types) type, dlen, sizeof (char));
579 ret = sys_wcstombs (*dst_p, dlen, src, nwc);
584 /* sys_cp_mbstowcs is actually most of the time called as sys_mbstowcs with
585 a 0 codepage. If cp is not 0, the codepage is evaluated and used for the
586 conversion. This is so that fhandler_console can switch to an alternate
587 charset, which is the charset returned by GetConsoleCP (). Most of the
588 time this is used for box and line drawing characters. */
590 sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
591 size_t dlen, const char *src, size_t nms)
594 unsigned const char *pmbs = (unsigned const char *) src;
601 memset (&ps, 0, sizeof ps);
604 while (len > 0 && nms > 0)
606 /* ASCII CAN handling. */
609 /* Sanity check: If this is a lead CAN byte for a following UTF-8
610 sequence, there must be at least two more bytes left, and the
611 next byte must be a valid UTF-8 start byte. If the charset
612 isn't UTF-8 anyway, try to convert the following bytes as UTF-8
614 if (nms > 2 && pmbs[1] >= 0xc2 && pmbs[1] <= 0xf4 && *charset != 'U'/*TF-8*/)
616 bytes = __utf8_mbtowc (_REENT, ptr, (const char *) pmbs + 1,
617 nms - 1, charset, &ps);
620 /* Invalid UTF-8 sequence? Treat the ASCII CAN character as
621 stand-alone ASCII CAN char. */
625 memset (&ps, 0, sizeof ps);
629 ++bytes; /* Count CAN byte */
630 if (bytes > 1 && ps.__count == 4)
632 /* First half of a surrogate. */
633 wchar_t *ptr2 = dst ? ptr + 1 : NULL;
634 int bytes2 = __utf8_mbtowc (_REENT, ptr2,
635 (const char *) pmbs + bytes,
636 nms - bytes, charset, &ps);
638 memset (&ps, 0, sizeof ps);
643 ptr = dst ? ptr + 1 : NULL;
649 /* Otherwise it's just a simple ASCII CAN. */
657 else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
660 /* The technique is based on a discussion here:
661 http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
663 Invalid bytes in a multibyte sequence are converted to
664 the private use area which is already used to store ASCII
665 chars invalid in Windows filenames. This technique allows
666 storing them in a symmetric way. */
669 *ptr = L'\xf000' | *pmbs;
670 memset (&ps, 0, sizeof ps);
678 ptr = dst ? ptr + 1 : NULL;
691 count = (count < dlen) ? count : dlen - 1;
699 sys_mbstowcs (wchar_t * dst, size_t dlen, const char *src, size_t nms)
701 return sys_cp_mbstowcs (cygheap->locale.mbtowc, cygheap->locale.charset,
702 dst, dlen, src, nms);
705 /* Same as sys_wcstombs_alloc, just backwards. */
707 sys_mbstowcs_alloc (wchar_t **dst_p, int type, const char *src, size_t nms)
711 ret = sys_mbstowcs (NULL, (size_t) -1, src, nms);
714 size_t dlen = ret + 1;
716 if (type == HEAP_NOTHEAP)
717 *dst_p = (wchar_t *) calloc (dlen, sizeof (wchar_t));
719 *dst_p = (wchar_t *) ccalloc ((cygheap_types) type, dlen,
723 ret = sys_mbstowcs (*dst_p, dlen, src, nms);
728 static WCHAR hex_wchars[] = L"0123456789abcdef";
731 RtlInt64ToHexUnicodeString (ULONGLONG value, PUNICODE_STRING dest,
734 USHORT len = append ? dest->Length : 0;
735 if (dest->MaximumLength - len < 16 * (int) sizeof (WCHAR))
736 return STATUS_BUFFER_OVERFLOW;
737 wchar_t *end = (PWCHAR) ((PBYTE) dest->Buffer + len);
738 register PWCHAR p = end + 16;
741 *p = hex_wchars[value & 0xf];
744 dest->Length += 16 * sizeof (WCHAR);
745 return STATUS_SUCCESS;