From 97f9c0216cf74ef334037c277c58dc42e16ec406 Mon Sep 17 00:00:00 2001 From: Keith Marshall Date: Tue, 25 Feb 2020 23:08:18 +0000 Subject: [PATCH] Reimplement wcrtomb() and wcsrtombs(); cf. MinGW-Issue #39687. --- mingwrt/ChangeLog | 38 +++++++ mingwrt/Makefile.in | 6 +- mingwrt/include/limits.h | 10 +- mingwrt/include/wchar.h | 61 +++++++++-- mingwrt/mingwex/codeset.c | 111 ++++++++++++++++++++ mingwrt/mingwex/wcharmap.c | 193 +++++++++++++++++++++++++++++++++++ mingwrt/mingwex/wcharmap.h | 91 +++++++++++++++++ mingwrt/mingwex/wcrtomb.c | 240 ++++++++++++++++++++++++++++++-------------- mingwrt/mingwex/wcsrtombs.c | 174 ++++++++++++++++++++++++++++++++ 9 files changed, 831 insertions(+), 93 deletions(-) create mode 100644 mingwrt/mingwex/codeset.c create mode 100644 mingwrt/mingwex/wcharmap.c create mode 100644 mingwrt/mingwex/wcharmap.h create mode 100644 mingwrt/mingwex/wcsrtombs.c diff --git a/mingwrt/ChangeLog b/mingwrt/ChangeLog index 70d3932..bbe6e89 100644 --- a/mingwrt/ChangeLog +++ b/mingwrt/ChangeLog @@ -1,3 +1,41 @@ +2020-02-25 Keith Marshall + + Reimplement wcrtomb() and wcsrtombs(); cf. MinGW-Issue #39687. + + * include/wchar.h [__MSVCRT_VERSION__ < __MSVCR80_DLL] + (wcrtomb, wcsrtombs): Implement them as static inline redirects to... + (__msvcrt_wcrtomb, __msvcrt_wcsrtombs): ...these; declare them. + (__mingw_wcrtomb, __mingw_wcsrtombs): Also declare these. + + * include/limits.h (MB_LEN_MAX): Update value; was 2, but should be 5. + + * mingwex/wcsrtombs.c: New file; it implements... + (__mingw_wcsrtombs, __msvcrt_wcsrtombs): ...this pair of new + functions, either of which serves as a replacement for... + (wcsrtombs): ...this; it was originally implemented... + * mingwex/wcrtomb.c: ...here; rewritten as new, it now implements... + (__mingw_wcrtomb, __msvcrt_wcrtomb): ...these new functions, + either of which serves as a replacement for... + (wcrtomb): ...this. + + * mingwex/wcharmap.h: New private header; it declares the API for... 
+ * mingwex/wcharmap.c: ...this new file, which implements... + (__mingw_wctomb_convert): ...this new function, required by... + (__mingw_wcrtomb, __msvcrt_wcrtomb, __mingw_wcsrtombs) + (__msvcrt_wcsrtombs): ...these; also provides... + (__mingw_wctomb_codeset_init, __mingw_wctomb_cur_max_init) + (__mingw_wctomb_cur_max): ...these supporting thread local storage + accessor functions. + + * mingwex/codeset.c: New file; it implements... + (__mb_codeset_for_locale, __mb_cur_max_for_codeset): ...this pair of + new helper functions; they identify the codeset, and respectively, its + MB_CUR_MAX for the effective process locale, which are required by... + (__mingw_wctomb_convert): ...this. + + * Makefile.in (libmingwex.a): Add dependency references for... + (codeset.$OBJEXT, wcharmap.$OBJEXT, wcsrtombs.$OBJEXT): ...these. + 2020-01-21 Keith Marshall Address MinGW-Issue #39658; declare rand_s() function. diff --git a/mingwrt/Makefile.in b/mingwrt/Makefile.in index aa4a00f..d0857fd 100644 --- a/mingwrt/Makefile.in +++ b/mingwrt/Makefile.in @@ -468,9 +468,9 @@ libmingwex.a: $(addsuffix .$(OBJEXT), mkstemp mkdtemp cryptnam setenv) vpath %.s ${mingwrt_srcdir}/mingwex vpath %.sx ${mingwrt_srcdir}/mingwex -libmingwex.a: $(addsuffix .$(OBJEXT), fwide mbrtowc mbsinit strnlen wcrtomb \ - wcsnlen wcstof wcstold wctob wctrans wctype wmemchr wmemcmp wmemcpy wmemmove \ - wmemset) +libmingwex.a: $(addsuffix .$(OBJEXT), codeset fwide mbrtowc mbsinit strnlen \ + wcharmap wcrtomb wcsrtombs wcsnlen wcstof wcstold wctob wctrans wctype wmemchr \ + wmemcmp wmemcpy wmemmove wmemset) # The wcsnlen() function, enumerated above, is an adaptation of strnlen(); # we need a specific rule to compile it, from shared source. 
diff --git a/mingwrt/include/limits.h b/mingwrt/include/limits.h index 49f7698..4ab1c59 100644 --- a/mingwrt/include/limits.h +++ b/mingwrt/include/limits.h @@ -6,7 +6,7 @@ * $Id$ * * Written by Colin Peters - * Copyright (C) 1997, 1999-2001, 2004, 2005, 2010, 2012, 2017, + * Copyright (C) 1997, 1999-2001, 2004, 2005, 2010, 2012, 2017, 2020, * MinGW.org Project * * @@ -53,11 +53,13 @@ /* Characteristics of the char data type. * - * FIXME: Is MB_LEN_MAX correct? Probably yes, for Microsoft MBCS, which - * effectively seem to all be DBCS. + * FIXME: Is MB_LEN_MAX correct? Earlier Microsoft documentation specified + * it as two, (which would probably have been okay, in the case of only DBCS + * encodings); today (2019), Microsoft's documentation says that five is the + * appropriate value. */ #define CHAR_BIT 8 -#define MB_LEN_MAX 2 +#define MB_LEN_MAX 5 #define SCHAR_MIN (-128) #define SCHAR_MAX 127 diff --git a/mingwrt/include/wchar.h b/mingwrt/include/wchar.h index 7ea78a7..290dc19 100644 --- a/mingwrt/include/wchar.h +++ b/mingwrt/include/wchar.h @@ -8,7 +8,8 @@ * * Unattributed original source. * Adapted by Rob Savoye - * Copyright (C) 1997, 1999-2009, 2011, 2015, 2016, 2018, MinGW.org Project. + * Copyright (C) 1997, 1999-2009, 2011, 2015, 2016, 2018-2020, + * MinGW.org Project. * * * Permission is hereby granted, free of charge, to any person obtaining a @@ -528,15 +529,18 @@ typedef wchar_t _Wint_t; typedef int mbstate_t; -/* The following multi-byte character conversion functions are - * implemented in libmingwex.a, (and maybe also in some non-free - * Microsoft libraries, such as MSVCP60.DLL and later). 
+/* The following multi-byte character conversion functions have been + * implemented by Microsoft, in non-free MSVCR80.DLL and later, (and + * maybe also in some earlier non-free DLLs, such as MSVCP60.DLL and + * later); they are also available in MSVCRT.DLL, from Vista onward, + * but to provide continuing support for earlier Windows versions, + * we invoke them via MinGW specific wrappers, defined below. */ __cdecl __MINGW_NOTHROW wint_t btowc (int); __cdecl __MINGW_NOTHROW int wctob (wint_t); -__cdecl __MINGW_NOTHROW -size_t mbrlen (const char *__restrict__, size_t, mbstate_t *__restrict__); +__cdecl __MINGW_NOTHROW size_t mbrlen +(const char *__restrict__, size_t, mbstate_t *__restrict__); __cdecl __MINGW_NOTHROW size_t mbrtowc (wchar_t *__restrict__, const char *__restrict__, size_t, mbstate_t *__restrict__); @@ -544,12 +548,53 @@ __cdecl __MINGW_NOTHROW size_t mbrtowc __cdecl __MINGW_NOTHROW size_t mbsrtowcs (wchar_t *__restrict__, const char **__restrict__, size_t, mbstate_t *__restrict__); -__cdecl __MINGW_NOTHROW -size_t wcrtomb (char * __restrict__, wchar_t, mbstate_t *__restrict__); +__cdecl __MINGW_NOTHROW size_t wcrtomb +(char * __restrict__, wchar_t, mbstate_t *__restrict__); __cdecl __MINGW_NOTHROW size_t wcsrtombs (char *__restrict__, const wchar_t **__restrict__, size_t, mbstate_t *__restrict__); +/* To provide support for the above, on legacy Windows versions, + * we implement fall back wrappers in libmingwex.a; each of these + * will delegate to the corresponding Microsoft implementation, if + * it exists in the process address space; otherwise, execution + * will fall back to a MinGW implementation... 
+ */ +__cdecl __MINGW_NOTHROW size_t __msvcrt_wcrtomb +(char * __restrict__, wchar_t, mbstate_t *__restrict__); + +__cdecl __MINGW_NOTHROW size_t __msvcrt_wcsrtombs +(char *__restrict__, const wchar_t **__restrict__, size_t, mbstate_t *__restrict__); + +/* ...whereas, these alternatives will always invoke the MinGW + * fall back implementations, without considering any possible + * reference to MSVCRT.DLL or MSVCR80.DLL implementations. + */ +__cdecl __MINGW_NOTHROW size_t __mingw_wcrtomb +(char * __restrict__, wchar_t, mbstate_t *__restrict__); + +__cdecl __MINGW_NOTHROW size_t __mingw_wcsrtombs +(char *__restrict__, const wchar_t **__restrict__, size_t, mbstate_t *__restrict__); + +#if __MSVCRT_VERSION__ < __MSVCR80_DLL +/* FIXME: Maybe consider these mappings, even for linking with the + * non-free MSVCR80.DLL, and its descendants. + * + * For linking with all versions of MSVCRT.DLL, and with non-free + * alternatives predating MSVCR80.DLL, we enforce inline mapping to + * the libmingwex.a implementations, (which will delegate the calls + * to the Microsoft DLL implementations, when they are available). + */ +__CRT_ALIAS __cdecl __MINGW_NOTHROW size_t wcrtomb +(char * __mbc, wchar_t __wc, mbstate_t *__ps) +{ return __msvcrt_wcrtomb(__mbc, __wc, __ps); } + +__CRT_ALIAS __cdecl __MINGW_NOTHROW size_t wcsrtombs +(char *__mbs, const wchar_t **__wcs, size_t __len, mbstate_t *__ps) +{ return __msvcrt_wcsrtombs(__mbs, __wcs, __len, __ps); } + +#endif /* ! 
MSVCR80.DLL or later */ + #if defined _ISOC99_SOURCE || defined __cplusplus /* These ISO-C99 functions are implemented in libmingwex.a, * or, in some cases, as inline stubs; while provided as MinGW diff --git a/mingwrt/mingwex/codeset.c b/mingwrt/mingwex/codeset.c new file mode 100644 index 0000000..711a4f4 --- /dev/null +++ b/mingwrt/mingwex/codeset.c @@ -0,0 +1,111 @@ +/* + * codeset.c + * + * Provides implementation-private helper functions, to identify the + * code page which is associated with the active process locale, and to + * establish the effective MB_CUR_MAX value for this code page. + * + * $Id$ + * + * Written by Keith Marshall + * Copyright (C) 2019, 2020, MinGW.org Project + * + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice, this permission notice, and the following + * disclaimer shall be included in all copies or substantial portions of + * the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + */ +#include +#include +#include +#include + +unsigned int __mb_codeset_for_locale( void ); +unsigned int __mb_cur_max_for_codeset( unsigned int ); + +unsigned int __mb_codeset_for_locale( void ) +{ + /* Extract the code page identification string (if any) from the LC_CTYPE + * identification string, as returned in "language[_region[.codeset]]", or + * ".codeset" format, by a setlocale() query on the current locale. + */ + char *default_locale_specification, *codeset_string; + if( (default_locale_specification = setlocale( LC_CTYPE, NULL )) != NULL ) + { + /* An unfortunate -- albeit documented -- limitation of Microsoft's + * setlocale() implementation is that it cannot correctly process any + * locale specification which refers to a MBCS codeset which may use + * more than two bytes for any single code point; to mitigate this, + * when the active locale matches the system default... + */ + char string_buffer[1 + strlen( default_locale_specification )]; + codeset_string = strcpy( string_buffer, default_locale_specification ); + if( strcmp( codeset_string, setlocale( LC_CTYPE, "" )) == 0 ) + { + /* ...although Microsoft's setlocale() doesn't support it, (and + * is neither expected to, nor required to), we may adopt POSIX.1 + * convention, in this particular case, to acquire a preferred + * default locale specification from the environment... + */ + if( ((default_locale_specification = getenv( "LC_ALL" )) != NULL) + || ((default_locale_specification = getenv( "LC_CTYPE" )) != NULL) + || ((default_locale_specification = getenv( "LANG" )) != NULL) ) + + /* ...and use that in place of Microsoft's setlocale() notion + * of the current effective LC_CTYPE locale category. + */ + codeset_string = default_locale_specification; + } + else + { /* The originally active locale does NOT match the system default, + * but we made it do so, by checking, so restore the original. 
+ */ + setlocale( LC_CTYPE, codeset_string ); + } + /* Regardless of how we established the effective LC_CTYPE category + * for the active locale, we may extract its codeset element... + */ + if( (codeset_string = strchr( codeset_string, '.' )) != NULL ) + { + /* ...interpreting the resultant string as its equivalent integer + * value, for validation and return. + */ + unsigned int retval = (unsigned int)(atoi( codeset_string + 1 )); + if( __mb_cur_max_for_codeset( retval ) > 0 ) return retval; + } + } + /* In the event that LC_CTYPE doesn't include a codeset identification, + * return an effective value of zero, which we may later interpret as a + * default representation for the "C" locale. + */ + return 0; +} + +unsigned int __mb_cur_max_for_codeset( unsigned int codeset ) +{ + /* Identify the length of the longest valid multibyte character encoding + * sequence, used within the specified MS-Windows code page, by consulting + * the relevant Win32 API database. Returns the appropriate byte count, + * or zero if the codeset identifier is not valid. + */ + CPINFO codeset_info; + return (GetCPInfo( codeset, &codeset_info )) ? codeset_info.MaxCharSize : 0; +} + +/* $RCSfile$: end of file */ diff --git a/mingwrt/mingwex/wcharmap.c b/mingwrt/mingwex/wcharmap.c new file mode 100644 index 0000000..3c0419a --- /dev/null +++ b/mingwrt/mingwex/wcharmap.c @@ -0,0 +1,193 @@ +/* + * wcharmap.c + * + * Provides an implementation-private helper function, to facilitate + * conversion from UTF-16LE wchar_t data, of arbitrary length, to an + * equivalent multi-byte character encoding sequence. 
+ * + * $Id$ + * + * Written by Keith Marshall + * Copyright (C) 2019, 2020, MinGW.org Project + * + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice, this permission notice, and the following + * disclaimer shall be included in all copies or substantial portions of + * the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ +#include "wcharmap.h" + +/* We need a definition of macro UCHAR_MAX; it is defined in + */ +#include + +/* The working codeset, and its associated effective MB_CUR_MAX, + * are stored with file-scope visibility, to facilitate passing + * them to individual elements of the implementation... + */ +static __thread unsigned int codeset, wctomb_cur_max; + +/* ...but, in this instance, we also need to provide a mechanism + * for initializing each of these from the global scope... 
+ */ +unsigned int __mingw_wctomb_codeset_init( void ) +{ return codeset = __mb_codeset_for_locale(); } + +unsigned int __mingw_wctomb_cur_max_init( unsigned int codeset ) +{ return wctomb_cur_max = __mb_cur_max_for_codeset( codeset ); } + +/* ...and also, an accessor to make the effective MB_CUR_MAX + * available in the global scope. + */ +unsigned int __mingw_wctomb_cur_max( void ) +{ return wctomb_cur_max; } + +size_t __mingw_wctomb_convert +( char *mbs, int mblen, const wchar_t *wcs, int wclen ) +{ + /* Helper function to map a sequence of wchars to their corresponding + * sequence of multibyte characters, encoded as is appropriate for the + * specified code page, (which is nominally the code page associated + * with the current locale). + * + * Inputs: + * mbs Buffer in which the encoded multibyte sequence may be + * returned, or NULL, if only the sequence length is to + * be determined, discarding the encoded data. + * + * mblen Number of bytes available in mbs; ignored if mbs is + * passed as NULL. + * + * wcs The sequence of wchars which is to be encoded. + * + * wclen The number of wchars in wcs; if passed as (size_t)(-1), + * scan until (wchar_t)(0), or until a wchar with no valid + * encoding, or space in the encoding buffer is exhausted. + * + * Returns: + * The number of encoded bytes (which would be) stored into mbs, if + * mbs is not NULL, and all specified wchars in wcs are successfully + * encoded; otherwise, returns (size_t)(-1), and sets errno to: + * + * EILSEQ If encoding is interrupted by a wchar with no valid + * encoding within the specified code page. + * + * ENOMEM The mbs pointer isn't NULL, but there is insufficient + * space in the designated buffer to store the encoded + * multibyte character sequence. 
+ */ + size_t retval; int eilseq_flag = 0; + + if( codeset == 0 ) + { /* Code page zero is assumed to represent the encoding which applies + * within the "C" locale; this is a single-byte encoding, with wchar + * values in the range L'\0'..L'\255' mapped to their identical byte + * values, and all greater wchar values considered to be invalid. + * + * Simply scan, count, and optionally store valid byte values, + * starting from an initial count of zero. + */ + retval = 0; + + if( (size_t)(wclen) == (size_t)(-1) ) + do { /* This is an unbounded scan; simply check that each + * successive wchar lies in the valid range... + */ + if( (unsigned)(*wcs) > UCHAR_MAX ) + /* ...otherwise, report an invalid encoding, and + * bail out. + */ + return errout( EILSEQ, wclen ); + + /* We got a valid input wchar... + */ + if( mbs != NULL ) + { /* ...which we are now expected to store... + */ + if( mblen-- > 0 ) *mbs++ = (unsigned char)(*wcs); + + /* ...but, we must bail out, if there is no + * space left in the encoding buffer. + */ + else return errout( ENOMEM, (size_t)(-1) ); + } + + /* We've accepted the current input wchar; count + * it, and then, provided it isn't the terminating + * NUL, move on to the next. + */ + ++retval; + } while( *wcs++ != L'\0' ); + + else while( wclen-- > 0 ) + { /* This is a bounded scan; as in the unbounded case, take + * each input wchar in turn, and verify that each lies in + * the valid encoding range. + */ + if( (unsigned)(*wcs) > UCHAR_MAX ) + return errout( EILSEQ, (size_t)(-1) ); + + /* We got a valid input wchar... + */ + if( mbs != NULL ) + { /* ...which we are now expected to store... + */ + if( mblen-- > 0 ) *mbs++ = (unsigned char)(*wcs); + + /* ...but, we must bail out, if there is no + * space left in the encoding buffer. + */ + else return errout( ENOMEM, (size_t)(-1) ); + } + + /* Ensure that we don't scan beyond a terminating NUL + * wchar, even if this lies within the bounded count. 
+ */ + if( *wcs++ == L'\0' ) wclen = 0; + + /* In any case, count the current encoded byte. + */ + ++retval; + } + + /* We now have the final count, for a code page zero encoding; + * we are done. + */ + return retval; + } + + /* For any code page other than zero, we delegate both encoding + * and byte counting to the Windows API; note that for code pages + * other than CP_UTF7 or CP_UTF8, (and CP_UTF8 is the only code + * page with an identifier greater than that for CP_UTF7), there + * may be unrepresentable UTF-16 code points, and we must pass a + * flag reference to detect their presence in the UTF-16LE input + * sequence; OTOH, any valid UTF-16 code point is representable + * in both CP_UTF7 and CP_UTF8, so no such flag is required, and + * WideCharToMultiByte() will choke, if the flag reference is + * not passed as NULL. + */ + retval = WideCharToMultiByte( codeset, 0, wcs, wclen, mbs, mblen, NULL, + (CP_UTF7 > codeset) ? &eilseq_flag : NULL + ); + return (eilseq_flag || (retval == 0)) ? errout( EILSEQ, (size_t)(-1) ) + : retval; +} + +/* $RCSfile$: end of file */ diff --git a/mingwrt/mingwex/wcharmap.h b/mingwrt/mingwex/wcharmap.h new file mode 100644 index 0000000..0aa91bb --- /dev/null +++ b/mingwrt/mingwex/wcharmap.h @@ -0,0 +1,91 @@ +/* + * wcharmap.h + * + * Private header file, declaring common components of the MinGW.org + * fallback implementations of wide to multi-byte (and complementary) + * character set conversion API functions. 
+ * + * $Id$ + * + * Written by Keith Marshall + * Copyright (C) 2019, 2020, MinGW.org Project + * + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice, this permission notice, and the following + * disclaimer shall be included in all copies or substantial portions of + * the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ +#include +#include +#include +#include + +/* Provide a shorthand notation for declaring functions which + * we would like to always be expanded in line. + */ +#define __mb_inline__ __inline__ __attribute__((__always_inline__)) + +/* Define a pair of inline helper functions, to facilitate preservation + * of the "errno" state on entry, such that it may be restored or modified, + * as necessary for ISO-C99 conformance, on function return. + * + * First, a helper to save, and clear, error state on entry... 
+ */ +static __mb_inline__ +int save_error_status_and_clear (int state, int clear) +{ errno = clear; return state; } + +/* ...and the complementary helper, which may be used to either restore + * the saved state, or to report a new error condition, on return. + */ +static __mb_inline__ +size_t errout (int errcode, size_t status){ errno = errcode; return status; } + +/* Generic codeset management functions. + */ +unsigned int __mb_codeset_for_locale (void); +unsigned int __mb_cur_max_for_codeset (unsigned int); + +/* Codeset initializers, and internal helper functions for + * wide character to multi-byte sequence conversions. + */ +unsigned int __mingw_wctomb_codeset_init (void); +unsigned int __mingw_wctomb_cur_max_init (unsigned int); +size_t __mingw_wctomb_convert (char *, int, const wchar_t *, int); +unsigned int __mingw_wctomb_cur_max (void); + +/* The legacy MinGW implementation used a get_codepage() function, + * which was effectively the same as our __mb_codeset_for_locale(); + * this alias may, eventually, become redundant. + */ +static __mb_inline__ +unsigned int get_codepage( void ){ return __mb_codeset_for_locale(); } + +/* A private helper function, to furnish an internal conversion state + * buffer, for use in any case where a conversion function was called, + * and the caller didn't provide one. + */ +static __mb_inline__ +mbstate_t *__mbrtowc_state( mbstate_t *reference_state ) +{ + static mbstate_t internal_state = (mbstate_t)(0); + return (reference_state == NULL) ? 
&internal_state : reference_state; +} + +/* $RCSfile$: end of file */ diff --git a/mingwrt/mingwex/wcrtomb.c b/mingwrt/mingwex/wcrtomb.c index 1bc561d..aa6e7d4 100644 --- a/mingwrt/mingwex/wcrtomb.c +++ b/mingwrt/mingwex/wcrtomb.c @@ -1,94 +1,178 @@ -#include "mb_wc_common.h" -#include -#include -#include -#include -#define WIN32_LEAN_AND_MEAN -#include +/* + * wcrtomb.c + * + * MinGW.org replacement for the wcrtomb() function; delegates to the + * Microsoft implementation, if available in the C runtime DLL, otherwise + * handles the call locally. + * + * $Id$ + * + * Written by Keith Marshall + * Copyright (C) 2019, 2020, MinGW.org Project + * + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice, this permission notice, and the following + * disclaimer shall be included in all copies or substantial portions of + * the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ +#include "wcharmap.h" +/* For runtime delegation, we need a mechanism for detection of an + * implementation, within the default C runtime DLL; we may use the + * MinGW dlfcn emulation, to facilitate this. 
+ */ +#include -static int __MINGW_ATTRIB_NONNULL(1) - __wcrtomb_cp (char *dst, wchar_t wc, const unsigned int cp, - const unsigned int mb_max) +static __mb_inline__ size_t __wcrtomb_fallback +( char *restrict mb, wchar_t wc, mbstate_t *ps ) { - if (cp == 0) + /* Fallback function, providing an implementation of the wcrtomb() + * function, when none is available within the Microsoft C runtime, + * or the user has explicitly overridden accessibility of any such + * Microsoft implementation. + */ + if( *ps != (mbstate_t)(0) ) + { + /* Microsoft's MBCS implementation does not use shift states; + * however, it is possible that an immediately preceding call + * may have stopped with a dangling high surrogate, and thus + * a restart to complete this, by adding a low surrogate, and + * converting the pair, may be appropriate. + */ + if( IS_HIGH_SURROGATE( *ps ) && IS_LOW_SURROGATE( wc ) ) { - if (wc > 255) - { - errno = EILSEQ; - return -1; - } - *dst = (char) wc; - return 1; + /* Reassemble the surrogate pair, in a local buffer, and + * return its conversion, having reset the restart state. + */ + wchar_t buf[2] = { (wchar_t)(*ps), wc }; *ps = (mbstate_t)(0); + return __mingw_wctomb_convert( mb, __mingw_wctomb_cur_max(), buf, 2 ); } - else - { - int invalid_char = 0; - - int size = WideCharToMultiByte (cp, 0 /* Is this correct flag? */, - &wc, 1, dst, mb_max, - NULL, &invalid_char); - if (size == 0 || invalid_char) - { - errno = EILSEQ; - return -1; - } - return size; + else + { /* We expected a low surrogate, but didn't get one; reset + * the restart state, and abort this conversion. 
+ */ + *ps = (mbstate_t)(0); return errout( EILSEQ, (size_t)(-1) ); } + } + /* When mb is a NULL pointer, ISO-C99 decrees that the call shall + * be interpreted as the equivalent of: + * + * wcrtomb( internal_buffer, L'\0', ps ); + * + * with the encoding of the NUL wchar, preceded by any sequence + * of bytes needed to restore ps to the initial shift state, being + * stored in the internal buffer, (and thus, inaccessible to the + * caller). Since Microsoft's MBCS encodings do not use shift + * states, and the encoding for NUL is always a single NUL byte, + * this becomes the equivalent of returning (size_t)(1). + */ + if( mb == NULL ) return (size_t)(1); + + /* When mb is not a NULL pointer, then we are obliged to assume + * that it points to a buffer of at least MB_CUR_MAX bytes, and + * we may proceed with a normal conversion, (except that, when + * wc lies in the range reserved for surrogates, we must handle + * it as a special case). + */ + if( IS_HIGH_SURROGATE( wc ) ) + { /* A high surrogate is permitted, but it cannot be converted + * at this time; instead, we simply record that it is present, + * (subverting ps for this purpose), and move on, without any + * conversion being performed, and thus storing no converted + * bytes, in the expectation that the next wc passed will be a + * low surrogate, thus allowing completion of the conversion. + */ + *ps = (mbstate_t)(wc); return (size_t)(0); + } + if( IS_LOW_SURROGATE( wc ) ) + /* A low surrogate, detected here, is an orphan (not paired + * with a high surrogate from an immediately preceding call); + * this is not permitted, so report it as invalid. + */ + return errout( EILSEQ, (size_t)(-1) ); + + /* If we get this far, we may proceed with conversion; we return + * the byte count, and effect of encoding the single wchar which + * was passed by value in wc. 
+ */ + return __mingw_wctomb_convert( mb, __mingw_wctomb_cur_max(), &wc, 1 ); } -size_t -wcrtomb (char *dst, wchar_t wc, mbstate_t * __UNUSED_PARAM (ps)) +static size_t __mingw_wcrtomb_fallback +( char *restrict mb, wchar_t wc, mbstate_t *ps ) { - char byte_bucket [MB_LEN_MAX]; - char* tmp_dst = dst ? dst : byte_bucket; - return (size_t)__wcrtomb_cp (tmp_dst, wc, get_codepage (), - MB_CUR_MAX); + /* A thin wrapper around the preceding fallback implementation, + * (which is expanded in-line); this serves as the sole interface + * between either of the two following public API entry points, and + * the fallback implementation, ensuring that a private mbstate_t + * reference is provided, if the caller doesn't supply its own. + */ + return __wcrtomb_fallback( mb, wc, __mbrtowc_state( ps ) ); } -size_t wcsrtombs (char *dst, const wchar_t **src, size_t len, - mbstate_t * __UNUSED_PARAM (ps)) +size_t __mingw_wcrtomb +( char *restrict mb, wchar_t wc, mbstate_t *restrict ps ) { - int ret = 0; - size_t n = 0; - const unsigned int cp = get_codepage(); - const unsigned int mb_max = MB_CUR_MAX; - const wchar_t *pwc = *src; + /* Wrapper for the wcrtomb() function; this will unconditionally + * delegate the call to the MinGW fallback implementation, (defined + * above), irrespective of availability of any Microsoft-provided + * implementation. + */ + (void)(__mingw_wctomb_cur_max_init( __mingw_wctomb_codeset_init() )); + return __mingw_wcrtomb_fallback( mb, wc, ps ); +} - if (src == NULL || *src == NULL) /* undefined behavior */ - return 0; +size_t __msvcrt_wcrtomb( char *restrict mb, wchar_t wc, mbstate_t *restrict ps ) +{ + /* Wrapper for the wcrtomb() function; this will initially attempt + * to delegate the call to a Microsoft-provided implementation, but + * if no such implementation can be found, fall back to the MinGW + * substitute (defined above). 
+ */ + typedef size_t (*redirect_t)( char *restrict, wchar_t, mbstate_t *restrict ); + static redirect_t redirector_hook = NULL; - if (dst != NULL) - { - while (n < len) - { - if ((ret = __wcrtomb_cp (dst, *pwc, cp, mb_max)) <= 0) - return (size_t) -1; - n += ret; - dst += ret; - if (*(dst - 1) == '\0') - { - *src = (wchar_t*) NULL;; - return (n - 1); - } - pwc++; - } - *src = pwc; - } - else - { - char byte_bucket [MB_LEN_MAX]; - while (n < len) - { - if ((ret = __wcrtomb_cp (byte_bucket, *pwc, cp, mb_max)) - <= 0) - return (size_t) -1; - n += ret; - if (byte_bucket [ret - 1] == '\0') - return (n - 1); - pwc++; - } - } + /* MSVCRT.DLL's setlocale() cannot reliably handle code pages with + * more than two bytes per code point, (e.g. UTF-7 and UTF-8); thus, + * Microsoft's wcrtomb() is likely to be similarly unreliable, so + * always use the MinGW fallback with such code pages. + */ + if( (__mingw_wctomb_cur_max_init( __mingw_wctomb_codeset_init() )) > 2 ) + return __mingw_wcrtomb_fallback( mb, wc, ps ); - return n; + /* On first time call, we don't know which implementation is to be + * selected; look for a Microsoft implementation, which, if available, + * may be registered for immediate use on this, and any subsequent, + * calls to this function wrapper... + */ + if( (redirector_hook == NULL) + && ((redirector_hook = dlsym( RTLD_DEFAULT, "wcrtomb" )) == NULL) ) + + /* ...but when no Microsoft implementation can be found, register + * the MinGW fall back in its stead. + */ + redirector_hook = __mingw_wcrtomb_fallback; + + /* Finally, delegate the call to whichever implementation has been + * registered on first-time call. 
+ */ + return redirector_hook( mb, wc, ps ); } + +/* $RCSfile$: end of file */ diff --git a/mingwrt/mingwex/wcsrtombs.c b/mingwrt/mingwex/wcsrtombs.c new file mode 100644 index 0000000..80bfee1 --- /dev/null +++ b/mingwrt/mingwex/wcsrtombs.c @@ -0,0 +1,174 @@ +/* + * wcsrtombs.c + * + * MinGW.org replacement for the wcsrtombs() function; delegates to the + * Microsoft implementation, if available in the C runtime DLL, otherwise + * handles the call locally. + * + * $Id$ + * + * Written by Keith Marshall + * Copyright (C) 2019, 2020, MinGW.org Project + * + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice, this permission notice, and the following + * disclaimer shall be included in all copies or substantial portions of + * the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ +#include "wcharmap.h" + +/* For runtime delegation, we need a mechanism for detection of an + * implementation, within the default C runtime DLL; we may use the + * MinGW dlfcn emulation, to facilitate this. 
+ */ +#include <dlfcn.h> + +static size_t __mingw_wcsrtombs_fallback +( char *restrict mbs, const wchar_t **restrict wcs, size_t len, + mbstate_t *__UNUSED_PARAM(ps) +) +{ /* Fallback function, providing an implementation of the wcsrtombs() + * function, when none is available within the Microsoft C runtime, or + * the user has elected to override any such Microsoft implementation. + * + * Initially, save the current errno state, so that we may restore + * it on return, clear it to zero for internal checking, and compute + * the size of buffer required to accommodate the conversion. + */ + int errno_reset = save_error_status_and_clear( errno, 0 ); + size_t wanted = __mingw_wctomb_convert( NULL, 0, *wcs, -1 ); + + if( mbs == NULL ) + /* There is no buffer designated to store the encoded multibyte + * character sequence; we are only interested in the size of the + * buffer which would otherwise be required, and we've already + * determined that, so simply return it. + */ + return (errno == 0) ? errout( errno_reset, wanted - 1 ) : wanted; + + if( (errno == 0) && (len >= wanted) ) + { /* There is an encoding buffer designated, its size is sufficient + * to accommodate the encoding of the entire NUL terminated input + * sequence, and there was no incipient encoding error during the + * initial minimum buffer size determination; encode the entire + * input sequence for return, and clean up the input state. + */ + len = __mingw_wctomb_convert( mbs, len, *wcs, -1 ) - 1; + *wcs = NULL; + } + + else + { /* There is an encoding buffer designated, but either it is too + * small, or an incipient encoding error has been detected; rescan + * the input sequence, encoding one code point at a time, until we + * either exhaust the encoding buffer space, or we encounter the + * encoding error previously identified. 
+ */ + size_t count = 0; errno = 0; + while( (len >= __mingw_wctomb_convert( NULL, 0, *wcs, 1 )) && (errno == 0) ) + { + /* There is still sufficient space to store the encoding of one + * more input code point, and we haven't yet fallen foul of any + * incipient encoding error; store this encoding, and adjust to + * prepare for the next. + */ + size_t step = __mingw_wctomb_convert( mbs, len, (*wcs)++, 1 ); + count += step; len -= step; mbs += step; + } + + /* Check that we didn't fall foul of any incipient encoding error; + * if we did, then we must bail out. + */ + if( errno != 0 ) return (size_t)(-1); + + /* If we're still here, then we've encoded as much of the input + * sequence as we can accommodate; the input pointer has already + * been adjusted, as required, but we must preserve the count of + * cumulatively encoded bytes, for return. + */ + len = count; + } + + /* We have now successfully encoded as much of the input sequence + * as possible, without encountering any encoding error; restore + * the saved errno state, and return the encoded byte count. + */ + return errout( errno_reset, len ); +} + +size_t __mingw_wcsrtombs +( char *mbs, const wchar_t **wcs, size_t len, mbstate_t *ps ) +{ + /* Wrapper for the wcsrtombs() function; this will unconditionally + * delegate the call to the MinGW fallback implementation, (defined + * above), after first ensuring that the specified wcs reference is + * valid, and that the effective codeset has been initialized. 
+ */ + if( (wcs == NULL) || (*wcs == NULL) ) return errout( EINVAL, (size_t)(-1) ); + + (void)(__mingw_wctomb_codeset_init() ); + return __mingw_wcsrtombs_fallback( mbs, wcs, len, ps ); +} + +size_t __msvcrt_wcsrtombs +( char *mbs, const wchar_t **wcs, size_t len, mbstate_t *ps ) +{ + /* Wrapper for the wcsrtombs() function; it will initially attempt + * to delegate the call to a Microsoft-provided implementation, but + * if no such implementation can be found, fall back to the MinGW + * substitute (defined above). + */ + typedef size_t (*redirect_t)(char *, const wchar_t **, size_t, mbstate_t *); + static redirect_t redirector_hook = NULL; + + /* Neither wcs, nor the pointer to which it refers, may be NULL. + * ISO C doesn't specify any particular outcome for this condition, + * (so a segmentation fault would conform); it makes more sense to + * catch the abnormality, and bail out. + */ + if( (wcs == NULL) || (*wcs == NULL) ) return errout( EINVAL, (size_t)(-1) ); + + /* MSVCRT.DLL's setlocale() cannot reliably handle code pages with + * more than two bytes per code point, (e.g. UTF-7 and UTF-8); thus, + * Microsoft's wcsrtombs() is likely to be similarly unreliable, so + * always use the MinGW fallback with such code pages. + */ + if( __mb_cur_max_for_codeset( __mingw_wctomb_codeset_init() ) > 2 ) + return __mingw_wcsrtombs_fallback( mbs, wcs, len, ps ); + + /* On first time call, we don't know which implementation is to be + * selected; look for a Microsoft implementation, which, if available, + * may be registered for immediate use on this, and any subsequent, + * calls to this function wrapper... + */ + if( (redirector_hook == NULL) + && ((redirector_hook = dlsym( RTLD_DEFAULT, "wcsrtombs" )) == NULL) ) + { + /* ...but when no Microsoft implementation can be found, register + * the MinGW fallback in its stead. 
+ */ + redirector_hook = __mingw_wcsrtombs_fallback; + } + /* Finally, delegate the call to whichever implementation has been + * registered on first-time call. + */ + return redirector_hook( mbs, wcs, len, ps ); +} + +/* $RCSfile$: end of file */ -- 2.11.0