4 * Implementation of the infrastructure routines to support the mbrlen(),
5 * and mbrtowc() functions, for use in those applications where Microsoft
6 * does not provide adequate support.
10 * Written by Keith Marshall <keith@users.osdn.me>
11 * Copyright (C) 2020, MinGW.org Project
14 * Permission is hereby granted, free of charge, to any person obtaining a
15 * copy of this software and associated documentation files (the "Software"),
16 * to deal in the Software without restriction, including without limitation
17 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
18 * and/or sell copies of the Software, and to permit persons to whom the
19 * Software is furnished to do so, subject to the following conditions:
21 * The above copyright notice, this permission notice, and the following
22 * disclaimer shall be included in all copies or substantial portions of
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
26 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
28 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF OR OTHER
31 * DEALINGS IN THE SOFTWARE.
/* We use the MB_LEN_MAX macro, to declare the size of internal
 * MBCS storage buffers; it is defined in <limits.h>
 */
/* The working codeset, and its associated effective MB_CUR_MAX,
 * are stored with file-scope visibility, to facilitate passing
 * them to individual elements of the implementation...
 */
/* Working codeset; this is the value handed to MultiByteToWideChar()
 * as its code page argument.  One instance exists per thread.
 */
static __thread unsigned int codeset;

/* Effective MB_CUR_MAX which accompanies the working codeset; it is
 * used as the upper bound on the byte length of one encoded character.
 * One instance exists per thread.
 */
static __thread unsigned int mbrlen_cur_max;
/* ...but, in this instance, we also need to provide a mechanism
 * for initializing each of these from the global scope...
 */
50 unsigned int __mingw_mbrtowc_codeset_init( void )
51 { return codeset = __mb_codeset_for_locale(); }
53 unsigned int __mingw_mbrlen_cur_max_init( unsigned int codeset )
54 { return mbrlen_cur_max = __mb_cur_max_for_codeset( codeset ); }
/* ...and also, an accessor to make the effective MB_CUR_MAX
 * available in the global scope.
 */
59 unsigned int __mingw_mbrlen_cur_max( void )
60 { return mbrlen_cur_max; }
62 int __mingw_mbtowc_convert
63 ( const char *s, size_t n, wchar_t *wc, size_t wmax )
65 /* Public helper function to determine if a MBCS sequence, of any
66 * arbitrary length, may be completely converted to a corresponding
67 * wchar_t sequence of specified maximum length, initially subject
68 * to the MB_ERR_INVALID_CHARS flag, and subsequently retrying with
69 * no flags, in the event that conversion with this flag yields an
70 * ERROR_INVALID_FLAGS exception.
72 * A maximum of "n" bytes of the given MBCS sequence, "s", will be
73 * examined, unless a NUL byte is encountered before "n" bytes have
74 * been evaluated; if "n" is specified as zero, it will be ignored,
75 * and the full sequence, assumed to be of unlimited length, will
76 * be examined until a NUL byte is encountered.
78 * Conversion of the given MBCS byte sequence will succeed, only
79 * if it represents a whole number of complete and valid code point
80 * encodings, and the fully converted representation of all of these
81 * code points can be accommodated within "wmax" wchar_t entities;
82 * (if "wmax" is specified as zero, it is treated as unlimited).
84 * If conversion is successful, the return value is the number of
85 * wchar_t entities required to accommodate the fully converted MBCS
86 * sequence; if conversion is unsuccessful, zero is returned.
89 unsigned int flags = MB_ERR_INVALID_CHARS;
90 if( n == 0 ) n = (size_t)(-1);
92 do { SetLastError( 0 );
93 st = MultiByteToWideChar( codeset, flags, s, n, wc, wmax );
94 } while( (st == (flags = 0)) && (GetLastError() == ERROR_INVALID_FLAGS) );
/* NOTE(review): in the listing from which this block was taken, the
 * embedded line numbers are not contiguous; short lines (comment
 * terminators, lone braces, and apparently some brief statements)
 * have been dropped.  Only the statements which remain visible are
 * reproduced below, unchanged; each suspected omission is flagged
 * where it occurs, and must be confirmed against a complete copy
 * of the file.
 */
size_t __mingw_mbrscan_begin
( wchar_t *restrict *wcs, const char **restrict src, size_t *len,
  mbstate_t *restrict ps
  /* NOTE(review): listing gap -- the parenthesis which closes this
   * parameter list is not visible.
   */
{ /* Public helper function, to retrieve, interpret, and complete
   * conversion state, as passed to any MBCS to wchar_t conversion
   * routine, via its mbstate_t reference parameter.
   *
   *   wcs  Reference to the caller's output pointer; when either
   *        "wcs" or "*wcs" is NULL, nothing is stored, otherwise
   *        each wchar_t delivered is stored through "*wcs", which
   *        is advanced past it.
   *   src  Reference to the caller's MBCS input pointer; bytes are
   *        read from "*src" to complete a pending sequence, and
   *        "*src" is advanced past those which are consumed.
   *   len  Reference to the remaining output capacity, which is
   *        reduced as wchar_t entities are stored.
   *   ps   Reference to the conversion state; a zero state makes
   *        this function a no-op, and any other state is captured
   *        and then reset to zero.
   *
   * The visible return paths are the error returns, which yield the
   * result of errout( EILSEQ, (size_t)(-1) ), (presumably setting
   * errno, and returning its second argument -- confirm); per the
   * closing comment, the normal result is the count of wchar_t
   * elements which result from conversion of pending state.
   */
  size_t count = (size_t)(0);

  /* This becomes a no-op, if there is no pending state data...
   */
  if( *ps != (mbstate_t)(0) )
  { /* ...otherwise, we capture, and map the pending state, for
     * completion and interpretation; the union allows the same
     * storage to be viewed as an mbstate_t, as a sequence of bytes,
     * or as a pair of wchar_t values...
     */
    union { mbstate_t st; char mb[MB_LEN_MAX]; wchar_t wc[2]; }
    /* NOTE(review): listing gap -- the declarator for this union is
     * not visible; the statements below refer to it as "state", and
     * it is presumably initialized from "*ps" -- confirm.
     */

    /* ...and mark the passed mbstate_t as completed.
     */
    *ps = (mbstate_t)(0);
    if( IS_SURROGATE_PAIR( state.wc[0], state.wc[1] ) )
    { /* When the pending state represents a surrogate pair, then
       * the high surrogate will have been returned previously; it
       * is the low surrogate which remains pending, and should now
       * be inserted into the return buffer, if any.
       */
      if( (wcs != NULL) && (*wcs != NULL) )
      { *(*wcs)++ = state.wc[1];
        if( *len > 0 ) --*len;
      /* NOTE(review): listing gap -- the brace which closes the
       * preceding compound statement is not visible.
       */

      /* In any case, we must account for the low surrogate, which
       * is represented by this pending state.
       *
       * NOTE(review): listing gap -- the statement which performs
       * this accounting, (presumably an adjustment of "count"), the
       * brace closing this branch, and the "else" introducing the
       * following block, are not visible.
       */
    { /* The pending state represents a previously scanned, but not
       * yet complete MBCS sequence; we must now add additional bytes,
       * from the MBCS input sequence, until the pending sequence is
       * either completed, or can be ruled as invalid.
       */
      int copy, scan = 0, mark = 0;

      /* To determine completion state, we need a scratch conversion
       * buffer which may subsequently be interpreted as mbstate.
       */
      union { mbstate_t st; wchar_t wc[2]; } buf;

      /* First, we mark the offset within the pending state buffer,
       * where the first additional byte should be appended; this is
       * the offset of the first NUL byte within it...
       */
      while( state.mb[mark] != '\0' ) ++mark;
      /* NOTE(review): listing gap -- the keyword which introduces
       * the following compound statement, (presumably "do"), is
       * not visible.
       */
      { /* ...then we extend this, by appending bytes from the MBCS
         * input, until we either NUL terminate it, or we reach the
         * effective maximum MBCS length for a single code point;
         * "scan" counts the input bytes appended, and "copy" tracks
         * the resulting length of the buffered sequence.
         */
        for( copy = mark; ((*src)[scan] != '\0') && (copy < mbrlen_cur_max); )
          state.mb[copy++] = (*src)[scan++];

        /* In the case of NUL termination, the terminating byte has
         * not been copied by the loop above, so it is stored here,
         * provided that there is still room for it.
         */
        if( copy < mbrlen_cur_max ) state.mb[copy] = '\0';

        /* Having now captured a potential single code point MBCS
         * sequence, in the state buffer, we now examine that, in
         * incremental steps of its initial byte sequence, until
         * we can successfully convert it, or we must reject it.
         *
         * NOTE(review): no reset of "scan" is visible between the
         * copy loop above, and this conversion loop; the listing
         * does skip a line just before this comment, so a reset may
         * simply have been dropped -- confirm, since "scan" serves
         * here as the length of the buffered sequence to convert.
         */
        do { copy = __mingw_mbtowc_convert( state.mb, ++scan, buf.wc, 2 );
           } while( (copy == 0) && (scan < mbrlen_cur_max) );

        /* If conversion is unsuccessful...
         *
         * NOTE(review): listing gap -- the condition which guards the
         * following block, (presumably a test for "copy" being zero),
         * is not visible.
         */
        { /* ...and we have extended the sequence to the maximum
           * length allowed for a single code point, then we must
           * reject the entire input sequence...
           */
          if( scan >= mbrlen_cur_max )
            return errout( EILSEQ, (size_t)(-1) );

          /* ...otherwise, there is still a possibility that we
           * may be able to complete this sequence during a later
           * call, so return it as pending state.
           *
           * NOTE(review): listing gap -- the statements which store
           * the pending state and return, and the brace closing this
           * block, are not visible.
           */

        /* A successful conversion, which requires more than one
         * wchar_t, MUST be represented as a surrogate pair; any
         * other longer representation is invalid.
         */
        if( (copy > 1) && ! IS_SURROGATE_PAIR( buf.wc[0], buf.wc[1] ) )
          return errout( EILSEQ, (size_t)(-1) );

        /* When the representation of a successful conversion is
         * accepted as valid, and...
         */
        if( (wcs != NULL) && (*wcs != NULL) )
          /* NOTE(review): listing gap -- the brace which opens the
           * body of this "if" is not visible.
           */

          /* ...the caller has provided a buffer, in which to
           * return it, then we return at least the first wchar
           * of its representation, and then...
           */
          *(*wcs)++ = buf.wc[0];
          if( *len >= (size_t)(copy) )
          { /* ...when the declared buffer length is sufficient
             * to accommodate more, and the conversion represents
             * a surrogate pair, we also return the low surrogate,
             * and adjust the length to account for it...
             */
            if( copy > 1 ) *(*wcs)++ = buf.wc[1];
            *len -= (size_t)(copy);
          /* NOTE(review): listing gap -- the closing brace, and the
           * "else" which introduces the following block, are not
           * visible.
           */
          { /* ...otherwise, when we have a surrogate pair to be
             * returned, but only sufficient buffer to accommodate
             * the high surrogate, we defer the low surrogate for
             * return during a subsequent call.
             *
             * NOTE(review): listing gap -- the statements which
             * implement this deferral, and the braces closing this
             * block and the enclosing "if", are not visible.
             */

        /* Increment the return count, to account for each wchar
         * which has been interpreted, thus far, from the given
         * input.
         */
        count += (size_t)(copy);

        /* Check that we have consumed all content from the given
         * pending state...
         *
         * NOTE(review): listing gap -- the remainder of the original
         * comment, and the condition which guards the following
         * block, are not visible; the "else" further below pairs
         * with that unseen condition.
         */
        { /* ...or otherwise, discard what we have consumed, and
           * promote the residual for further consideration; the
           * buffered sequence is first cut back to its original
           * pending length, and the bytes which follow the converted
           * prefix are then moved to the start of the buffer.
           */
          state.mb[mark] = '\0';
          for( mark = 0; state.mb[mark] != '\0'; ++mark, ++scan )
            state.mb[mark] = state.mb[scan];
        /* NOTE(review): listing gap -- the brace which closes the
         * preceding block is not visible.
         */

        /* When all pending state has been consumed, adjust the
         * input MBCS sequence pointer, to account for any bytes
         * used to complete that pending state.
         */
        else *src += (scan - mark);
        /* NOTE(review): listing gap -- the termination of the loop,
         * and the braces which close the enclosing blocks, are not
         * visible.
         */

  /* Ultimately, return the count of wchar elements, if any, which
   * result from conversion of pending state.
   *
   * NOTE(review): listing gap -- the return statement itself, and
   * the brace which closes the function, are not visible.
   */
/* NOTE(review): in the listing from which this block was taken, the
 * embedded line numbers are not contiguous; short lines, (including
 * braces, and apparently some brief statements), have been dropped.
 * Only the statements which remain visible are reproduced below,
 * unchanged; each suspected omission is flagged where it occurs, and
 * must be confirmed against a complete copy of the file.
 */
size_t __mingw_mbtowc_copy
( wchar_t *restrict wcs, const char *restrict src, size_t len )
  /* NOTE(review): listing gap -- the brace which opens the function
   * body is not visible.
   */

  /* Public helper function to copy a sequence of one or more wchar_t
   * elements, which result from conversion of the given MBCS sequence,
   * either to a caller-provided buffer, (or, if none is provided, use
   * an internal scratch buffer, to facilitate counting the number of
   * such elements which would be copied, without storing them).
   *
   *   wcs  Caller-provided output buffer, or NULL when only the
   *        count of elements is required.
   *   src  MBCS sequence to be converted.
   *   len  Limit on the number of wchar_t elements to be delivered;
   *        (presumably this bounds the enclosing loop -- confirm).
   *
   * On conversion failure, the result of errout( EILSEQ, (size_t)(-1) )
   * is returned, (presumably setting errno, and returning its second
   * argument -- confirm).
   */
  wchar_t scratch[2]; size_t count = (size_t)(0);
  /* NOTE(review): listing gap -- three lines are skipped here; the
   * loop header, and the declarations of "copy" and "scan", both of
   * which are used below, are presumably among them.
   */

    /* Deliver to the caller's buffer when there is one, otherwise to
     * the local scratch buffer; then attempt conversion of successively
     * longer prefixes of "src", until one succeeds, or the effective
     * maximum length for a single code point has been tried.
     *
     * NOTE(review): each attempt offers room for two wchar_t elements,
     * independently of "len"; whether a caller-provided buffer always
     * has that much room remaining is not established by the visible
     * code -- confirm.
     */
    wchar_t *wc = (wcs == NULL) ? scratch : wcs;
    do { copy = __mingw_mbtowc_convert( src, ++scan, wc, 2 );
       } while( (copy == 0) && (scan < mbrlen_cur_max) );

    /* No prefix, up to the maximum length, could be converted.
     */
    if( copy == 0 ) return errout( EILSEQ, (size_t)(-1) );

    /* A converted NUL wchar_t marks the end of the input; setting
     * "len" equal to "count" presumably terminates the enclosing
     * loop -- confirm.
     *
     * NOTE(review): listing gap -- two lines are skipped after this
     * statement; an "else" branch, which advances "count" and "src",
     * is presumably among them.
     */
    if( *wc == L'\0' ) len = count;

      /* When storing into a caller-provided buffer, step past the
       * elements which have just been stored.
       */
      if( wcs != NULL ) wcs += copy;
  /* NOTE(review): listing gap -- the remainder of the function, which
   * presumably closes the loop and returns "count", is not visible.
   */
287 /* $RCSfile$: end of file */