X-Git-Url: http://git.osdn.net/view?a=blobdiff_plain;f=winsup%2Fcygwin%2Fstrfuncs.cc;fp=winsup%2Fcygwin%2Fstrfuncs.cc;h=61df6505a03f53e52484072cc053baad2ce44524;hb=ae4bf010374a9320497af260fa90af3fe8e2c5a5;hp=0000000000000000000000000000000000000000;hpb=3cc729069938336ea54d399c4bbbe7d197295f9a;p=pf3gnuchains%2Fpf3gnuchains3x.git diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc new file mode 100644 index 0000000000..61df6505a0 --- /dev/null +++ b/winsup/cygwin/strfuncs.cc @@ -0,0 +1,746 @@ +/* strfuncs.cc: misc funcs that don't belong anywhere else + + Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, + 2005, 2006, 2007, 2008, 2009 Red Hat, Inc. + +This file is part of Cygwin. + +This software is a copyrighted work licensed under the terms of the +Cygwin license. Please consult the file "CYGWIN_LICENSE" for +details. */ + +#include "winsup.h" +#include +#include +#include +#include +#include "cygerrno.h" +#include "security.h" +#include "path.h" +#include "fhandler.h" +#include "dtable.h" +#include "cygheap.h" +#include "tls_pbuf.h" + +/* Transform characters invalid for Windows filenames to the Unicode private + use area in the U+f0XX range. The affected characters are all control + chars 1 <= c <= 31, as well as the characters " * : < > ? |. The backslash + is affected as well, but we can't transform it as long as we accept Win32 + paths as input. + The reverse functionality is in function sys_cp_wcstombs. */ +static const WCHAR tfx_chars[] = { + 0, 0xf000 | 1, 0xf000 | 2, 0xf000 | 3, + 0xf000 | 4, 0xf000 | 5, 0xf000 | 6, 0xf000 | 7, + 0xf000 | 8, 0xf000 | 9, 0xf000 | 10, 0xf000 | 11, + 0xf000 | 12, 0xf000 | 13, 0xf000 | 14, 0xf000 | 15, + 0xf000 | 16, 0xf000 | 17, 0xf000 | 18, 0xf000 | 19, + 0xf000 | 20, 0xf000 | 21, 0xf000 | 22, 0xf000 | 23, + 0xf000 | 24, 0xf000 | 25, 0xf000 | 26, 0xf000 | 27, + 0xf000 | 28, 0xf000 | 29, 0xf000 | 30, 0xf000 | 31, + ' ', '!', 0xf000 | '"', '#', + '$', '%', '&', 39, + '(', ')', 0xf000 | '*', '+', + ',', '-', '.', '\\', + '0', '1', '2', '3', + '4', '5', '6', '7', + '8', '9', 0xf000 | ':', ';', + 0xf000 | '<', '=', 0xf000 | '>', 0xf000 | '?', + '@', 'A', 'B', 'C', + 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', + 'T', 'U', 'V', 'W', + 'X', 'Y', 'Z', '[', + '\\', ']', '^', '_', + '`', 'a', 'b', 'c', + 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', + 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', + 't', 'u', 'v', 'w', + 'x', 'y', 'z', '{', + 0xf000 | '|', '}', '~', 127 +}; + +void +transform_chars (PWCHAR path, PWCHAR path_end) +{ + for (; path <= path_end; ++path) + if (*path < 128) + *path = tfx_chars[*path]; +} + +/* The SJIS, JIS and eucJP conversion in newlib does not use UTF as + wchar_t character representation. That's unfortunate for us since + we require UTF for the OS. What we do here is to have our own + implementation of the base functions for the conversion using + the MulitByteToWideChar/WideCharToMultiByte functions. */ + +/* FIXME: We can't support JIS (ISO-2022-JP) at all right now. It's a + stateful charset encoding. The translation from mbtowc to + MulitByteToWideChar is quite complex. Given that we support SJIS and + eucJP, the both most used Japanese charset encodings, this shouldn't + be such a big problem. */ + +/* GBK, eucKR, and Big5 conversions are not available so far in newlib. */ + +static int +__db_wctomb (struct _reent *r, char *s, wchar_t wchar, UINT cp) +{ + if (s == NULL) + return 0; + + if (wchar < 0x80) + { + *s = (char) wchar; + return 1; + } + + BOOL def_used = false; + int ret = WideCharToMultiByte (cp, WC_NO_BEST_FIT_CHARS, &wchar, 1, s, + 2, NULL, &def_used); + if (ret > 0 && !def_used) + return ret; + + r->_errno = EILSEQ; + return -1; +} + +extern "C" int +__sjis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 932); +} + +extern "C" int +__jis_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + /* FIXME: See comment at start of file. */ + return __ascii_wctomb (r, s, wchar, charset, state); +} + +extern "C" int +__eucjp_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + /* Unfortunately, the Windows eucJP codepage 20932 is not really 100% + compatible to eucJP. It's a cute approximation which makes it a + doublebyte codepage. + The JIS-X-0212 three byte codes (0x8f,0xa1-0xfe,0xa1-0xfe) are folded + into two byte codes as follows: The 0x8f is stripped, the next byte is + taken as is, the third byte is mapped into the lower 7-bit area by + masking it with 0x7f. So, for instance, the eucJP code 0x8f,0xdd,0xf8 + becomes 0xdd,0x78 in CP 20932. + + To be really eucJP compatible, we have to map the JIS-X-0212 characters + between CP 20932 and eucJP ourselves. */ + if (s == NULL) + return 0; + + if (wchar < 0x80) + { + *s = (char) wchar; + return 1; + } + + BOOL def_used = false; + int ret = WideCharToMultiByte (20932, WC_NO_BEST_FIT_CHARS, &wchar, 1, s, + 3, NULL, &def_used); + if (ret > 0 && !def_used) + { + /* CP20932 representation of JIS-X-0212 character? */ + if (ret == 2 && (unsigned char) s[1] <= 0x7f) + { + /* Yes, convert to eucJP three byte sequence */ + s[2] = s[1] | 0x80; + s[1] = s[0]; + s[0] = 0x8f; + ++ret; + } + return ret; + } + + r->_errno = EILSEQ; + return -1; +} + +extern "C" int +__gbk_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 936); +} + +extern "C" int +__kr_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 949); +} + +extern "C" int +__big5_wctomb (struct _reent *r, char *s, wchar_t wchar, const char *charset, + mbstate_t *state) +{ + return __db_wctomb (r,s, wchar, 950); +} + +static int +__db_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, UINT cp, + mbstate_t *state) +{ + wchar_t dummy; + int ret; + + if (s == NULL) + return 0; /* not state-dependent */ + + if (n == 0) + return -2; + + if (pwc == NULL) + pwc = &dummy; + + if (state->__count == 0) + { + if (*(unsigned char *) s < 0x80) + { + *pwc = *(unsigned char *) s; + return *s ? 1 : 0; + } + size_t cnt = min (n, 2); + ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1); + if (ret) + return cnt; + if (n == 1) + { + state->__count = n; + state->__value.__wchb[0] = *s; + return -2; + } + /* These Win32 functions are really crappy. Assuming n is 2 but the + first byte is a singlebyte charcode, the function does not convert + that byte and return 1, rather it just returns 0. So, what we do + here is to check if the first byte returns a valid value... */ + else if (MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, s, 1, pwc, 1)) + return 1; + r->_errno = EILSEQ; + return -1; + } + state->__value.__wchb[state->__count] = *s; + ret = MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, + (const char *) state->__value.__wchb, 2, pwc, 1); + if (!ret) + { + r->_errno = EILSEQ; + return -1; + } + state->__count = 0; + return 1; +} + +extern "C" int +__sjis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 932, state); +} + +extern "C" int +__jis_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + /* FIXME: See comment at start of file. */ + return __ascii_mbtowc (r, pwc, s, n, charset, state); +} + +extern "C" int +__eucjp_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + /* See comment in __eucjp_wctomb above. */ + wchar_t dummy; + int ret = 0; + + if (s == NULL) + return 0; /* not state-dependent */ + + if (n == 0) + return -2; + + if (pwc == NULL) + pwc = &dummy; + + if (state->__count == 0) + { + if (*(unsigned char *) s < 0x80) + { + *pwc = *(unsigned char *) s; + return *s ? 1 : 0; + } + if (*(unsigned char *) s == 0x8f) /* JIS-X-0212 lead byte? */ + { + /* Yes. Store sequence in mbstate and handle in the __count != 0 + case at the end of the function. */ + size_t i; + for (i = 0; i < 3 && i < n; i++) + state->__value.__wchb[i] = s[i]; + if ((state->__count = i) < 3) /* Incomplete sequence? */ + return -2; + ret = 3; + goto jis_x_0212; + } + size_t cnt = min (n, 2); + if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, cnt, pwc, 1)) + return cnt; + if (n == 1) + { + state->__count = 1; + state->__value.__wchb[0] = *s; + return -2; + } + else if (MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, s, 1, pwc, 1)) + return 1; + r->_errno = EILSEQ; + return -1; + } + state->__value.__wchb[state->__count++] = *s; + ret = 1; +jis_x_0212: + if (state->__value.__wchb[0] == 0x8f) + { + if (state->__count == 2) + { + if (n == 1) + return -2; + state->__value.__wchb[state->__count] = s[1]; + ret = 2; + } + /* Ok, we have a full JIS-X-0212 sequence in mbstate. Convert it + to the CP 20932 representation and feed it to MultiByteToWideChar. */ + state->__value.__wchb[0] = state->__value.__wchb[1]; + state->__value.__wchb[1] = state->__value.__wchb[2] & 0x7f; + } + if (!MultiByteToWideChar (20932, MB_ERR_INVALID_CHARS, + (const char *) state->__value.__wchb, 2, pwc, 1)) + { + r->_errno = EILSEQ; + return -1; + } + state->__count = 0; + return ret; +} + +extern "C" int +__gbk_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 936, state); +} + +extern "C" int +__kr_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 949, state); +} + +extern "C" int +__big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n, + const char *charset, mbstate_t *state) +{ + return __db_mbtowc (r, pwc, s, n, 950, state); +} + +/* Convert Windows codepage to a setlocale compatible character set code. + Called from newlib's setlocale() with codepage set to 0, if the + charset isn't given explicitely in the POSIX compatible locale specifier. + The function also returns a pointer to the corresponding _mbtowc_r + function. */ +extern "C" mbtowc_p +__set_charset_from_codepage (UINT cp, char *charset) +{ + if (cp == 0) + cp = GetACP (); + switch (cp) + { + case 437: + case 720: + case 737: + case 775: + case 850: + case 852: + case 855: + case 857: + case 858: + case 862: + case 866: + case 874: + case 1125: + case 1250: + case 1251: + case 1252: + case 1253: + case 1254: + case 1255: + case 1256: + case 1257: + case 1258: + case 20866: + case 21866: + __small_sprintf (charset, "CP%u", cp); + return __cp_mbtowc; + case 28591: + case 28592: + case 28593: + case 28594: + case 28595: + case 28596: + case 28597: + case 28598: + case 28599: + case 28603: + case 28605: + __small_sprintf (charset, "ISO-8859-%u", cp - 28590); + return __iso_mbtowc; + case 932: + strcpy (charset, "SJIS"); + return __sjis_mbtowc; + case 936: + strcpy (charset, "GBK"); + return __gbk_mbtowc; + case 949: + case 51949: + strcpy (charset, "EUCKR"); + return __kr_mbtowc; + case 950: + strcpy (charset, "BIG5"); + return __big5_mbtowc; + case 50220: + strcpy (charset, "JIS"); + return __jis_mbtowc; + case 20932: + case 51932: + strcpy (charset, "EUCJP"); + return __eucjp_mbtowc; + case 65001: + strcpy (charset, "UTF-8"); + return __utf8_mbtowc; + default: + break; + } + strcpy (charset, "ASCII"); + return __ascii_mbtowc; +} + +/* Our own sys_wcstombs/sys_mbstowcs functions differ from the + wcstombs/mbstowcs API in three ways: + + - The UNICODE private use area is used in filenames to specify + characters not allowed in Windows filenames ('*', '?', etc). + The sys_wcstombs converts characters in the private use area + back to the corresponding ASCII chars. + + - If a wide character in a filename has no representation in the current + multibyte charset, then usually you wouldn't be able to access the + file. To fix this problem, sys_wcstombs creates a replacement multibyte + sequences for the non-representable wide-char. The sequence starts with + an ASCII CAN (0x18, Ctrl-X), followed by the UTF-8 representation of the + character. The sys_(cp_)mbstowcs function detects ASCII CAN characters + in the input multibyte string and converts the following multibyte + sequence in by treating it as an UTF-8 char. If that fails, the ASCII + CAN was probably standalone and it gets just copied over as ASCII CAN. + + - The functions always create 0-terminated results, no matter what. + If the result is truncated due to buffer size, it's a bug in Cygwin + and the buffer in the calling function should be raised. */ +size_t __stdcall +sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len, + const wchar_t *src, size_t nwc) +{ + char buf[10]; + char *ptr = dst; + wchar_t *pwcs = (wchar_t *) src; + size_t n = 0; + mbstate_t ps; + save_errno save; + + memset (&ps, 0, sizeof ps); + if (dst == NULL) + len = (size_t) -1; + while (n < len && nwc-- > 0) + { + wchar_t pw = *pwcs; + int bytes; + unsigned char cwc; + + /* Convert UNICODE private use area. Reverse functionality for the + ASCII area <= 0x7f (only for path names) is transform_chars above. + Reverse functionality for invalid bytes in a multibyte sequence is + in sys_cp_mbstowcs below. */ + if ((pw & 0xff00) == 0xf000 + && (((cwc = (pw & 0xff)) <= 0x7f && tfx_chars[cwc] >= 0xf000) + || (cwc >= 0x80 && MB_CUR_MAX > 1))) + { + buf[0] = (char) cwc; + bytes = 1; + } + else + { + bytes = f_wctomb (_REENT, buf, pw, charset, &ps); + if (bytes == -1 && *charset != 'U'/*TF-8*/) + { + /* Convert chars invalid in the current codepage to a sequence + ASCII CAN; UTF-8 representation of invalid char. */ + buf[0] = 0x18; /* ASCII CAN */ + bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps); + if (bytes == -1) + { + ++pwcs; + ps.__count = 0; + continue; + } + ++bytes; /* Add the ASCII CAN to the byte count. */ + if (ps.__count == -4 && nwc > 0) + { + /* First half of a surrogate pair. */ + ++pwcs; + if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */ + { + ++pwcs; + ps.__count = 0; + continue; + } + bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset, + &ps); + nwc--; + } + } + } + if (n + bytes <= len) + { + n += bytes; + if (dst) + { + for (int i = 0; i < bytes; ++i) + *ptr++ = buf[i]; + } + if (*pwcs++ == 0x00) + break; + } + else + break; + } + if (n && dst) + { + n = (n < len) ? n : len - 1; + dst[n] = '\0'; + } + + return n; +} + +size_t __stdcall +sys_wcstombs (char *dst, size_t len, const wchar_t * src, size_t nwc) +{ + return sys_cp_wcstombs (cygheap->locale.wctomb, cygheap->locale.charset, + dst, len, src, nwc); +} + +/* Allocate a buffer big enough for the string, always including the + terminating '\0'. The buffer pointer is returned in *dst_p, the return + value is the number of bytes written to the buffer, as usual. + The "type" argument determines where the resulting buffer is stored. + It's either one of the cygheap_types values, or it's "HEAP_NOTHEAP". + In the latter case the allocation uses simple calloc. + + Note that this code is shared by cygserver (which requires it via + __small_vsprintf) and so when built there plain calloc is the + only choice. */ +size_t __stdcall +sys_wcstombs_alloc (char **dst_p, int type, const wchar_t *src, size_t nwc) +{ + size_t ret; + + ret = sys_wcstombs (NULL, (size_t) -1, src, nwc); + if (ret > 0) + { + size_t dlen = ret + 1; + + if (type == HEAP_NOTHEAP) + *dst_p = (char *) calloc (dlen, sizeof (char)); + else + *dst_p = (char *) ccalloc ((cygheap_types) type, dlen, sizeof (char)); + if (!*dst_p) + return 0; + ret = sys_wcstombs (*dst_p, dlen, src, nwc); + } + return ret; +} + +/* sys_cp_mbstowcs is actually most of the time called as sys_mbstowcs with + a 0 codepage. If cp is not 0, the codepage is evaluated and used for the + conversion. This is so that fhandler_console can switch to an alternate + charset, which is the charset returned by GetConsoleCP (). Most of the + time this is used for box and line drawing characters. */ +size_t __stdcall +sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst, + size_t dlen, const char *src, size_t nms) +{ + wchar_t *ptr = dst; + unsigned const char *pmbs = (unsigned const char *) src; + size_t count = 0; + size_t len = dlen; + int bytes; + mbstate_t ps; + save_errno save; + + memset (&ps, 0, sizeof ps); + if (dst == NULL) + len = (size_t)-1; + while (len > 0 && nms > 0) + { + /* ASCII CAN handling. */ + if (*pmbs == 0x18) + { + /* Sanity check: If this is a lead CAN byte for a following UTF-8 + sequence, there must be at least two more bytes left, and the + next byte must be a valid UTF-8 start byte. If the charset + isn't UTF-8 anyway, try to convert the following bytes as UTF-8 + sequence. */ + if (nms > 2 && pmbs[1] >= 0xc2 && pmbs[1] <= 0xf4 && *charset != 'U'/*TF-8*/) + { + bytes = __utf8_mbtowc (_REENT, ptr, (const char *) pmbs + 1, + nms - 1, charset, &ps); + if (bytes < 0) + { + /* Invalid UTF-8 sequence? Treat the ASCII CAN character as + stand-alone ASCII CAN char. */ + bytes = 1; + if (dst) + *ptr = 0x18; + memset (&ps, 0, sizeof ps); + } + else + { + ++bytes; /* Count CAN byte */ + if (bytes > 1 && ps.__count == 4) + { + /* First half of a surrogate. */ + wchar_t *ptr2 = dst ? ptr + 1 : NULL; + int bytes2 = __utf8_mbtowc (_REENT, ptr2, + (const char *) pmbs + bytes, + nms - bytes, charset, &ps); + if (bytes2 < 0) + memset (&ps, 0, sizeof ps); + else + { + bytes += bytes2; + ++count; + ptr = dst ? ptr + 1 : NULL; + --len; + } + } + } + } + /* Otherwise it's just a simple ASCII CAN. */ + else + { + bytes = 1; + if (dst) + *ptr = 0x18; + } + } + else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms, + charset, &ps)) < 0) + { + /* The technique is based on a discussion here: + http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html + + Invalid bytes in a multibyte secuence are converted to + the private use area which is already used to store ASCII + chars invalid in Windows filenames. This technque allows + to store them in a symmetric way. */ + bytes = 1; + if (dst) + *ptr = L'\xf000' | *pmbs; + memset (&ps, 0, sizeof ps); + } + + if (bytes > 0) + { + pmbs += bytes; + nms -= bytes; + ++count; + ptr = dst ? ptr + 1 : NULL; + --len; + } + else + { + if (bytes == 0) + ++count; + break; + } + } + + if (count && dst) + { + count = (count < dlen) ? count : dlen - 1; + dst[count] = L'\0'; + } + + return count; +} + +size_t __stdcall +sys_mbstowcs (wchar_t * dst, size_t dlen, const char *src, size_t nms) +{ + return sys_cp_mbstowcs (cygheap->locale.mbtowc, cygheap->locale.charset, + dst, dlen, src, nms); +} + +/* Same as sys_wcstombs_alloc, just backwards. */ +size_t __stdcall +sys_mbstowcs_alloc (wchar_t **dst_p, int type, const char *src, size_t nms) +{ + size_t ret; + + ret = sys_mbstowcs (NULL, (size_t) -1, src, nms); + if (ret > 0) + { + size_t dlen = ret + 1; + + if (type == HEAP_NOTHEAP) + *dst_p = (wchar_t *) calloc (dlen, sizeof (wchar_t)); + else + *dst_p = (wchar_t *) ccalloc ((cygheap_types) type, dlen, + sizeof (wchar_t)); + if (!*dst_p) + return 0; + ret = sys_mbstowcs (*dst_p, dlen, src, nms); + } + return ret; +} + +static WCHAR hex_wchars[] = L"0123456789abcdef"; + +NTSTATUS NTAPI +RtlInt64ToHexUnicodeString (ULONGLONG value, PUNICODE_STRING dest, + BOOLEAN append) +{ + USHORT len = append ? dest->Length : 0; + if (dest->MaximumLength - len < 16 * (int) sizeof (WCHAR)) + return STATUS_BUFFER_OVERFLOW; + wchar_t *end = (PWCHAR) ((PBYTE) dest->Buffer + len); + register PWCHAR p = end + 16; + while (p-- > end) + { + *p = hex_wchars[value & 0xf]; + value >>= 4; + } + dest->Length += 16 * sizeof (WCHAR); + return STATUS_SUCCESS; +}