4 * Implementation of back-end MBCS to wchar_t conversion infrastructure
5 * routines to support the MinGW mbrlen(), and mbrtowc() functions.
10 * Written by Keith Marshall <keith@users.osdn.me>
11 * Copyright (C) 2020, MinGW.org Project
14 * Permission is hereby granted, free of charge, to any person obtaining a
15 * copy of this software and associated documentation files (the "Software"),
16 * to deal in the Software without restriction, including without limitation
17 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
18 * and/or sell copies of the Software, and to permit persons to whom the
19 * Software is furnished to do so, subject to the following conditions:
21 * The above copyright notice, this permission notice, and the following
22 * disclaimer shall be included in all copies or substantial portions of
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
26 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
28 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER
31 * DEALINGS IN THE SOFTWARE.
36 /* We use the MB_LEN_MAX macro, to declare the size of internal
37 * MBCS storage buffers; it is defined in <limits.h>
/* mbrlen_min: probe for the shortest prefix of an MBCS sequence which
 * __mingw_mbtowc_convert() is able to convert.
 *
 *   s   start of the MBCS byte sequence to be examined.
 *   n   upper bound on the prefix length which will be tried.
 *   wc  buffer handed to __mingw_mbtowc_convert() to receive the
 *       converted result; the call passes 2 as its final argument,
 *       presumably the wchar_t capacity needed for a surrogate pair;
 *       TODO confirm against the declaration of that function.
 *
 * The loop repeats the conversion attempt with a prefix length which
 * grows by one on each pass, for as long as the reported status is
 * zero, and the next prefix length does not exceed "n".
 *
 * NOTE(review): the declarations of "status" and "len", and the return
 * statement, lie outside the lines reviewed, so the initial prefix
 * length and the exact return value are not documented here; the
 * caller in this file casts the result to int, and treats a positive
 * value as the byte count of a complete sequence.
 */
41 static __mb_inline__ size_t mbrlen_min
42 ( const char *restrict s, size_t n, wchar_t *restrict wc )
44 /* Internal helper function to determine the minimum number of
45 * initial bytes, within a MBCS sequence, which are required to
46 * represent a single wide character code point, (which may be
47 * represented as a single wchar_t entity, or alternatively as
48 * a surrogate pair of two such wchar_t entities). At most,
49 * the routine will examine the initial "n" bytes of the given
50 * MBCS sequence, (with "n" nominally limited to the effective
51 * MB_CUR_MAX for the specified codeset).
54 do { status = __mingw_mbtowc_convert( s, len, wc, 2 );
55 } while( (status == 0) && (n >= ++len) );
/* __mingw_mbrtowc_handler: shared back-end for mbrtowc() and mbrlen().
 *
 * Parameters, as used by the statements visible in this function:
 *
 *   pwc  optional destination for the converted wchar_t; it is tested
 *        against NULL before a deferred low surrogate is stored.
 *   s    MBCS input; bytes are copied from s[0] onward, stopping at a
 *        NUL byte, after "n" bytes, or when the working buffer holds
 *        mbrlen_cur_max bytes, whichever comes first.
 *   n    look-ahead limit, in bytes; per the comments within, zero is
 *        accepted as a special case, for retrieval of a deferred low
 *        surrogate.
 *   ps   conversion state; it is dereferenced unconditionally, to
 *        seed the local "state" union, so it must not be NULL here.
 *
 * Return values which are visible in this function:
 *
 *   (size_t)(0)       when the converted wide character is L'\0';
 *   count - pending   the length of the completed MBCS sequence, less
 *                     the bytes deferred from any preceding call;
 *   (size_t)(-1)      via errout(), with EINVAL when the incoming
 *                     state holds a surrogate pair outside of the
 *                     n == 0 special case, or with EILSEQ when the
 *                     deferred bytes alone already span the converted
 *                     length, or when the sequence is invalid.
 *
 * An incomplete sequence, shorter than mbrlen_cur_max, saves its state
 * and returns a pseudo-count; that return statement is outside the
 * lines reviewed; ISO C specifies (size_t)(-2) for this case in
 * mbrtowc(); TODO confirm.
 *
 * NOTE(review): state.mb has MB_LEN_MAX elements, whereas the scan
 * for pending bytes is bounded by sizeof( mbstate_t ), and the copy
 * loop by mbrlen_cur_max; this presumes that neither bound exceeds
 * MB_LEN_MAX; confirm for every supported codeset.
 *
 * NOTE(review): "s" is read without any NULL check in the lines
 * reviewed; presumably the public wrappers substitute an empty string
 * for a NULL "s", as ISO C requires; verify against the callers.
 */
59 size_t __mingw_mbrtowc_handler
60 ( wchar_t *restrict pwc, const char *restrict s, size_t n,
61 mbstate_t *restrict ps
63 { /* Common handler for MinGW mbrtowc() and mbrlen() functions.
65 (void)(__mingw_mbrlen_cur_max_init( __mingw_mbrtowc_codeset_init() ));
66 { union { mbstate_t st; wchar_t wc[2]; } retval;
67 union { mbstate_t st; char mb[MB_LEN_MAX]; wchar_t wc[2]; } state = { *ps };
68 unsigned int mbrlen_cur_max = __mingw_mbrlen_cur_max();
69 size_t pending, len = 0, count = 0;
71 /* Any residual state, from a preceding call, has been captured
72 * in the local "state" union; assume that this call will clear
73 * any such state, leaving no further residual.
77 /* Normally, it makes no sense to call mbrlen(), or mbrtowc(),
78 * with a look-ahead byte count limit of zero; however, due to
79 * the constraints imposed by MS-Windows using UTF-16LE as the
80 * underlying encoding for wchar_t...
83 { /* ...we allow this, as a special case, so that, when any
84 * immediately preceding call to mbrtowc() has returned a
85 * high surrogate, the accompanying low surrogate...
87 if( IS_SURROGATE_PAIR( state.wc[0], state.wc[1] ) )
89 /* ...may be returned to the caller, without consuming
90 * any further bytes from the original MBCS sequence.
92 if( pwc != NULL ) *pwc = state.wc[1];
95 /* When the conversion state does not represent a deferred
96 * low surrogate, then restore it, and pass this through as
102 /* In any context, other than the preceding (special) n == 0
103 * case, for retrieval of a deferred low surrogate, a pending
104 * conversion state which represents a surrogate pair is not
105 * a valid state; reject it.
107 if( IS_SURROGATE_PAIR( state.wc[0], state.wc[1] ) )
108 return errout( EINVAL, (size_t)(-1) );
110 /* Step over any pending MBCS bytes, which may already be
111 * present within the conversion state buffer, accumulating
112 * both the count of such pending bytes, together with a
113 * partial count of total bytes for conversion.
115 while( (len < sizeof( mbstate_t )) && (state.mb[len] != '\0') )
119 /* Append MBCS bytes from the input sequence, to the pending
120 * state buffer, up to the specified look-ahead count limit, or
121 * until the filled length of the buffer becomes equivalent to
122 * the effective value of MB_CUR_MAX.
124 while( (len < mbrlen_cur_max) && (count < n) && (s[count] != '\0') )
125 state.mb[len++] = s[count++];
127 /* If the pending look-ahead state has not yet been padded
128 * to the full MB_CUR_MAX length, ensure that it is encoded
129 * as a NUL terminated MBCS sequence, before attempting to
130 * interpret it as a complete MBCS sequence.
132 if( len < mbrlen_cur_max ) state.mb[len] = '\0';
133 if( (int)(count = mbrlen_min( state.mb, len, retval.wc )) > 0 )
135 /* No valid conversion state should ever exist, where no
136 * additional bytes are required to complete a previously
137 * deferred multibyte character.
139 if( pending >= count ) return errout( EILSEQ, (size_t)(-1) );
141 /* The accumulated encoding state does now represent a
142 * complete MBCS sequence; when servicing an mbrtowc() call,
143 * with non-NULL return value pointer, we must store that
147 { /* ...noting that, under MS-Windows, we may not be able
148 * to accommodate the entire converted value in a single
149 * UTF-16 wchar_t, in which case we must return it as a
150 * surrogate pair, of which only the high surrogate can
153 if( IS_HIGH_SURROGATE( *pwc = retval.wc[0] ) )
154 /* ...with the entire pair being stored at the passed
155 * mbstate_t reference buffer, allowing for subsequent
156 * retrieval of the low surrogate.
160 /* In the case that the wchar_t return value represents a
161 * NUL character, ISO-C99 prescribes that, whichever of the
162 * supported functions is being serviced, the returned byte
163 * count, of converted MBCS bytes, must be zero.
165 if( retval.wc[0] == L'\0' ) return (size_t)(0);
167 /* The effective function return value, for this case, is
168 * the count of bytes accumulated into the completed MBCS
169 * byte sequence, discounting those which were deferred
170 * from any preceding call.
172 return (count - pending);
174 else if( count < mbrlen_cur_max )
175 { /* The accumulated encoding state does not represent a
176 * complete, and valid MBCS sequence, but we have not yet
177 * accumulated as many bytes as the effective MB_CUR_MAX
178 * length can accommodate; save the encoding state for
179 * deferred reprocessing, and return the appropriate
180 * pseudo-count to inform the caller that this encoding
181 * state may yet develop into a valid MBCS sequence.
187 /* If neither of the preceding encoding states prevails, then
188 * the current state must represent an invalid MBCS sequence;
189 * report it via errno, and appropriate return value.
191 return errout( EILSEQ, (size_t)(-1) );
194 /* $RCSfile$: end of file */