4 * Routines for manipulating UTF-8 strings.
6 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
8 * See the file "license.terms" for information on usage and redistribution
9 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
17 * Include the static character classification tables and macros.
20 #include "tclUniData.c"
23 * The following macros are used for fast character category tests. The
24 * x_BITS values are shifted right by the category value to determine whether
25 * the given category is included in the set.
28 #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
29 | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
31 #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
33 #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
34 | (1 << PARAGRAPH_SEPARATOR))
36 #define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
38 #define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
39 (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
40 (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
41 (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
42 (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
43 (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
44 (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
45 (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
46 (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
48 #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
49 (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
50 (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
51 (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
54 * Unicode characters less than this value are represented by themselves
58 #define UNICODE_SELF 0x80
61 * The following structures are used when mapping between Unicode (UCS-2)
65 static CONST unsigned char totalBytes[256] = {
66 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
67 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
68 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
69 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
70 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
71 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
72 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
73 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
92 * Procedures used only in this module.
95 static int UtfCount _ANSI_ARGS_((int ch));
99 *---------------------------------------------------------------------------
103 * Find the number of bytes in the Utf character "ch".
106 * The return value is the number of bytes in the Utf character "ch".
111 *---------------------------------------------------------------------------
116 int ch; /* The Tcl_UniChar whose size is returned. */
118 if ((ch > 0) && (ch < UNICODE_SELF)) {
128 if (ch <= 0x1FFFFF) {
131 if (ch <= 0x3FFFFFF) {
134 if (ch <= 0x7FFFFFFF) {
142 *---------------------------------------------------------------------------
144 * Tcl_UniCharToUtf --
146 * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
147 * provided buffer. Equivalent to Plan 9 runetochar().
150 * The return value is the number of bytes in the buffer that
156 *---------------------------------------------------------------------------
160 Tcl_UniCharToUtf(ch, str)
161 int ch; /* The Tcl_UniChar to be stored in the
163 char *str; /* Buffer in which the UTF-8 representation
164 * of the Tcl_UniChar is stored. Buffer must
165 * be large enough to hold the UTF-8 character
166 * (at most TCL_UTF_MAX bytes). */
168 if ((ch > 0) && (ch < UNICODE_SELF)) {
173 str[1] = (char) ((ch | 0x80) & 0xBF);
174 str[0] = (char) ((ch >> 6) | 0xC0);
179 str[2] = (char) ((ch | 0x80) & 0xBF);
180 str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
181 str[0] = (char) ((ch >> 12) | 0xE0);
186 if (ch <= 0x1FFFFF) {
187 str[3] = (char) ((ch | 0x80) & 0xBF);
188 str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
189 str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
190 str[0] = (char) ((ch >> 18) | 0xF0);
193 if (ch <= 0x3FFFFFF) {
194 str[4] = (char) ((ch | 0x80) & 0xBF);
195 str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
196 str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
197 str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
198 str[0] = (char) ((ch >> 24) | 0xF8);
201 if (ch <= 0x7FFFFFFF) {
202 str[5] = (char) ((ch | 0x80) & 0xBF);
203 str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
204 str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
205 str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
206 str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
207 str[0] = (char) ((ch >> 30) | 0xFC);
217 *---------------------------------------------------------------------------
219 * Tcl_UniCharToUtfDString --
221 * Convert the given Unicode string to UTF-8.
224 * The return value is a pointer to the UTF-8 representation of the
225 * Unicode string. Storage for the return value is appended to the
231 *---------------------------------------------------------------------------
235 Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
236 CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
237 int numChars; /* Length of Unicode string in Tcl_UniChars
239 Tcl_DString *dsPtr; /* UTF-8 representation of string is
240 * appended to this previously initialized
243 CONST Tcl_UniChar *w, *wEnd;
248 * UTF-8 string length in bytes will be <= Unicode string length *
252 oldLength = Tcl_DStringLength(dsPtr);
253 Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
254 string = Tcl_DStringValue(dsPtr) + oldLength;
257 wEnd = wString + numChars;
258 for (w = wString; w < wEnd; ) {
259 p += Tcl_UniCharToUtf(*w, p);
262 Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
268 *---------------------------------------------------------------------------
270 * Tcl_UtfToUniChar --
272 * Extract the Tcl_UniChar represented by the UTF-8 string. Bad
273 * UTF-8 sequences are converted to valid Tcl_UniChars and processing
274 * continues. Equivalent to Plan 9 chartorune().
276 * The caller must ensure that the source buffer is long enough that
277 * this routine does not run off the end and dereference non-existent
278 * memory looking for trail bytes. If the source buffer is known to
279 * be '\0' terminated, this cannot happen. Otherwise, the caller
280 * should call Tcl_UtfCharComplete() before calling this routine to
281 * ensure that enough bytes remain in the string.
284 * *chPtr is filled with the Tcl_UniChar, and the return value is the
285 * number of bytes from the UTF-8 string that were consumed.
290 *---------------------------------------------------------------------------
294 Tcl_UtfToUniChar(str, chPtr)
295 register CONST char *str; /* The UTF-8 string. */
296 register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
297 * by the UTF-8 string. */
302 * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
305 byte = *((unsigned char *) str);
308 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
309 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
310 * characters representing themselves.
313 *chPtr = (Tcl_UniChar) byte;
315 } else if (byte < 0xE0) {
316 if ((str[1] & 0xC0) == 0x80) {
318 * Two-byte-character lead-byte followed by a trail-byte.
321 *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
325 * A two-byte-character lead-byte not followed by trail-byte
329 *chPtr = (Tcl_UniChar) byte;
331 } else if (byte < 0xF0) {
332 if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
334 * Three-byte-character lead byte followed by two trail bytes.
337 *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
338 | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
342 * A three-byte-character lead-byte not followed by two trail-bytes
346 *chPtr = (Tcl_UniChar) byte;
351 int ch, total, trail;
353 total = totalBytes[byte];
356 ch = byte & (0x3F >> trail);
359 if ((*str & 0xC0) != 0x80) {
373 *chPtr = (Tcl_UniChar) byte;
378 *---------------------------------------------------------------------------
380 * Tcl_UtfToUniCharDString --
382 * Convert the UTF-8 string to Unicode.
385 * The return value is a pointer to the Unicode representation of the
386 * UTF-8 string. Storage for the return value is appended to the
387 * end of dsPtr. The Unicode string is terminated with a Unicode
393 *---------------------------------------------------------------------------
397 Tcl_UtfToUniCharDString(string, length, dsPtr)
398 CONST char *string; /* UTF-8 string to convert to Unicode. */
399 int length; /* Length of UTF-8 string in bytes, or -1
401 Tcl_DString *dsPtr; /* Unicode representation of string is
402 * appended to this previously initialized
405 Tcl_UniChar *w, *wString;
410 length = strlen(string);
414 * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
418 oldLength = Tcl_DStringLength(dsPtr);
419 Tcl_DStringSetLength(dsPtr,
420 (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
421 wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
424 end = string + length;
425 for (p = string; p < end; ) {
426 p += Tcl_UtfToUniChar(p, w);
430 Tcl_DStringSetLength(dsPtr,
431 (oldLength + ((char *) w - (char *) wString)));
437 *---------------------------------------------------------------------------
439 * Tcl_UtfCharComplete --
441 * Determine if the UTF-8 string of the given length is long enough
442 * to be decoded by Tcl_UtfToUniChar(). This does not ensure that the
443 * UTF-8 string is properly formed. Equivalent to Plan 9 fullrune().
446 * The return value is 0 if the string is not long enough, non-zero
452 *---------------------------------------------------------------------------
456 Tcl_UtfCharComplete(str, len)
457 CONST char *str; /* String to check if first few bytes
458 * contain a complete UTF-8 character. */
459 int len; /* Length of above string in bytes. */
463 ch = *((unsigned char *) str);
464 return len >= totalBytes[ch];
468 *---------------------------------------------------------------------------
472 * Returns the number of characters (not bytes) in the UTF-8 string,
473 * not including the terminating NULL byte. This is equivalent to
474 * Plan 9 utflen() and utfnlen().
482 *---------------------------------------------------------------------------
486 Tcl_NumUtfChars(str, len)
487 register CONST char *str; /* The UTF-8 string to measure. */
488 int len; /* The length of the string in bytes, or -1
489 * for strlen(string). */
492 register Tcl_UniChar *chPtr = &ch;
497 * The separate implementations are faster.
503 str += Tcl_UtfToUniChar(str, chPtr);
511 n = Tcl_UtfToUniChar(str, chPtr);
521 *---------------------------------------------------------------------------
523 * Tcl_UtfFindFirst --
525 * Returns a pointer to the first occurrence of the given Tcl_UniChar
526 * in the NULL-terminated UTF-8 string. The NULL terminator is
527 * considered part of the UTF-8 string. Equivalent to Plan 9
531 * As above. If the Tcl_UniChar does not exist in the given string,
532 * the return value is NULL.
537 *---------------------------------------------------------------------------
540 Tcl_UtfFindFirst(string, ch)
541 CONST char *string; /* The UTF-8 string to be searched. */
542 int ch; /* The Tcl_UniChar to search for. */
548 len = Tcl_UtfToUniChar(string, &find);
552 if (*string == '\0') {
560 *---------------------------------------------------------------------------
564 * Returns a pointer to the last occurrence of the given Tcl_UniChar
565 * in the NULL-terminated UTF-8 string. The NULL terminator is
566 * considered part of the UTF-8 string. Equivalent to Plan 9
570 * As above. If the Tcl_UniChar does not exist in the given string,
571 * the return value is NULL.
576 *---------------------------------------------------------------------------
580 Tcl_UtfFindLast(string, ch)
581 CONST char *string; /* The UTF-8 string to be searched. */
582 int ch; /* The Tcl_UniChar to search for. */
590 len = Tcl_UtfToUniChar(string, &find);
594 if (*string == '\0') {
603 *---------------------------------------------------------------------------
607 * Given a pointer to some current location in a UTF-8 string,
608 * move forward one character. The caller must ensure that they
609 * are not asking for the next character after the last character
613 * The return value is the pointer to the next character in
619 *---------------------------------------------------------------------------
624 CONST char *str; /* The current location in the string. */
628 return str + Tcl_UtfToUniChar(str, &ch);
632 *---------------------------------------------------------------------------
636 * Given a pointer to some current location in a UTF-8 string,
637 * move backwards one character. This works correctly when the
638 * pointer is in the middle of a UTF-8 character.
641 * The return value is a pointer to the previous character in the
642 * UTF-8 string. If the current location was already at the
643 * beginning of the string, the return value will also be a
644 * pointer to the beginning of the string.
649 *---------------------------------------------------------------------------
653 Tcl_UtfPrev(str, start)
654 CONST char *str; /* The current location in the string. */
655 CONST char *start; /* Pointer to the beginning of the
656 * string, to avoid going backwards too
664 for (i = 0; i < TCL_UTF_MAX; i++) {
671 byte = *((unsigned char *) look);
684 *---------------------------------------------------------------------------
686 * Tcl_UniCharAtIndex --
688 * Returns the Unicode character represented at the specified
689 * character (not byte) position in the UTF-8 string.
697 *---------------------------------------------------------------------------
701 Tcl_UniCharAtIndex(src, index)
702 register CONST char *src; /* The UTF-8 string to dereference. */
703 register int index; /* The position of the desired character. */
709 src += Tcl_UtfToUniChar(src, &ch);
715 *---------------------------------------------------------------------------
719 * Returns a pointer to the specified character (not byte) position
720 * in the UTF-8 string.
728 *---------------------------------------------------------------------------
732 Tcl_UtfAtIndex(src, index)
733 register CONST char *src; /* The UTF-8 string. */
734 register int index; /* The position of the desired character. */
740 src += Tcl_UtfToUniChar(src, &ch);
746 *---------------------------------------------------------------------------
748 * Tcl_UtfBackslash --
750 * Figure out how to handle a backslash sequence.
753 * Stores the bytes represented by the backslash sequence in dst and
754 * returns the number of bytes written to dst. At most TCL_UTF_MAX
755 * bytes are written to dst; dst must have been large enough to accept
756 * those bytes. If readPtr isn't NULL then it is filled in with a
757 * count of the number of bytes in the backslash sequence.
760 * The maximum number of bytes it takes to represent a Unicode
761 * character in UTF-8 is guaranteed to be less than the number of
762 * bytes used to express the backslash sequence that represents
763 * that Unicode character. If the target buffer into which the
764 * caller is going to store the bytes that represent the Unicode
765 * character is at least as large as the source buffer from which
766 * the backslashed sequence was extracted, no buffer overruns should
769 *---------------------------------------------------------------------------
773 Tcl_UtfBackslash(src, readPtr, dst)
774 CONST char *src; /* Points to the backslash character of
775 * a backslash sequence. */
776 int *readPtr; /* Fill in with number of characters read
777 * from src, unless NULL. */
778 char *dst; /* Filled with the bytes represented by the
779 * backslash sequence. */
781 #define LINE_LENGTH 128
785 result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
786 if (numRead == LINE_LENGTH) {
787 /* We ate a whole line. Pay the price of a strlen() */
788 result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
790 if (readPtr != NULL) {
797 *----------------------------------------------------------------------
801 * Convert lowercase characters to uppercase characters in a UTF
802 * string in place. The conversion may shrink the UTF string.
805 * Returns the number of bytes in the resulting string
806 * excluding the trailing null.
809 * Writes a terminating null after the last converted character.
811 *----------------------------------------------------------------------
816 char *str; /* String to convert in place. */
818 Tcl_UniChar ch, upChar;
823 * Iterate over the string until we hit the terminating null.
828 bytes = Tcl_UtfToUniChar(src, &ch);
829 upChar = Tcl_UniCharToUpper(ch);
832 * To keep badly formed Utf strings from getting inflated by
833 * the conversion (thereby causing a segfault), only copy the
834 * upper case char to dst if its size is <= the original char.
837 if (bytes < UtfCount(upChar)) {
838 memcpy(dst, src, (size_t) bytes);
841 dst += Tcl_UniCharToUtf(upChar, dst);
850 *----------------------------------------------------------------------
854 * Convert uppercase characters to lowercase characters in a UTF
855 * string in place. The conversion may shrink the UTF string.
858 * Returns the number of bytes in the resulting string
859 * excluding the trailing null.
862 * Writes a terminating null after the last converted character.
864 *----------------------------------------------------------------------
869 char *str; /* String to convert in place. */
871 Tcl_UniChar ch, lowChar;
876 * Iterate over the string until we hit the terminating null.
881 bytes = Tcl_UtfToUniChar(src, &ch);
882 lowChar = Tcl_UniCharToLower(ch);
885 * To keep badly formed Utf strings from getting inflated by
886 * the conversion (thereby causing a segfault), only copy the
887 * lower case char to dst if its size is <= the original char.
890 if (bytes < UtfCount(lowChar)) {
891 memcpy(dst, src, (size_t) bytes);
894 dst += Tcl_UniCharToUtf(lowChar, dst);
903 *----------------------------------------------------------------------
907 * Changes the first character of a UTF string to title case or
908 * uppercase and the rest of the string to lowercase. The
909 * conversion happens in place and may shrink the UTF string.
912 * Returns the number of bytes in the resulting string
913 * excluding the trailing null.
916 * Writes a terminating null after the last converted character.
918 *----------------------------------------------------------------------
923 char *str; /* String to convert in place. */
925 Tcl_UniChar ch, titleChar, lowChar;
930 * Capitalize the first character and then lowercase the rest of the
931 * characters until we get to a null.
937 bytes = Tcl_UtfToUniChar(src, &ch);
938 titleChar = Tcl_UniCharToTitle(ch);
940 if (bytes < UtfCount(titleChar)) {
941 memcpy(dst, src, (size_t) bytes);
944 dst += Tcl_UniCharToUtf(titleChar, dst);
949 bytes = Tcl_UtfToUniChar(src, &ch);
950 lowChar = Tcl_UniCharToLower(ch);
952 if (bytes < UtfCount(lowChar)) {
953 memcpy(dst, src, (size_t) bytes);
956 dst += Tcl_UniCharToUtf(lowChar, dst);
965 *----------------------------------------------------------------------
969 * Compare at most n bytes of utf-8 strings cs and ct. Both cs
970 * and ct are assumed to be at least n bytes long.
973 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
978 *----------------------------------------------------------------------
982 TclpUtfNcmp2(cs, ct, n)
983 CONST char *cs; /* UTF string to compare to ct. */
984 CONST char *ct; /* UTF string cs is compared to. */
985 unsigned long n; /* Number of *bytes* to compare. */
988 * We can't simply call 'memcmp(cs, ct, n);' because we need to check
989 * for Tcl's \xC0\x80 non-utf-8 null encoding.
990 * Otherwise utf-8 lexes fine in the strcmp manner.
992 register int result = 0;
994 for ( ; n != 0; n--, cs++, ct++) {
996 result = UCHAR(*cs) - UCHAR(*ct);
1000 if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
1001 unsigned char c1, c2;
1002 c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
1003 c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
1010 *----------------------------------------------------------------------
1014 * Compare at most n UTF chars of string cs to string ct. Both cs
1015 * and ct are assumed to be at least n UTF chars long.
1018 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1023 *----------------------------------------------------------------------
1027 Tcl_UtfNcmp(cs, ct, n)
1028 CONST char *cs; /* UTF string to compare to ct. */
1029 CONST char *ct; /* UTF string cs is compared to. */
1030 unsigned long n; /* Number of UTF chars to compare. */
1032 Tcl_UniChar ch1, ch2;
1034 * Cannot use 'memcmp(cs, ct, n);' as byte representation of
1035 * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
1036 * representation of \u0001 (the byte 0x01.)
1040 * n must be interpreted as chars, not bytes.
1041 * This should be called only when both strings are of
1042 * at least n chars long (no need for \0 check)
1044 cs += Tcl_UtfToUniChar(cs, &ch1);
1045 ct += Tcl_UtfToUniChar(ct, &ch2);
1054 *----------------------------------------------------------------------
1056 * Tcl_UtfNcasecmp --
1058 * Compare at most n UTF chars of string cs to string ct case
1059 * insensitive. Both cs and ct are assumed to be at least n
1063 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1068 *----------------------------------------------------------------------
1072 Tcl_UtfNcasecmp(cs, ct, n)
1073 CONST char *cs; /* UTF string to compare to ct. */
1074 CONST char *ct; /* UTF string cs is compared to. */
1075 unsigned long n; /* Number of UTF chars to compare. */
1077 Tcl_UniChar ch1, ch2;
1080 * n must be interpreted as chars, not bytes.
1081 * This should be called only when both strings are of
1082 * at least n chars long (no need for \0 check)
1084 cs += Tcl_UtfToUniChar(cs, &ch1);
1085 ct += Tcl_UtfToUniChar(ct, &ch2);
1087 ch1 = Tcl_UniCharToLower(ch1);
1088 ch2 = Tcl_UniCharToLower(ch2);
1098 *----------------------------------------------------------------------
1100 * Tcl_UniCharToUpper --
1102 * Compute the uppercase equivalent of the given Unicode character.
1105 * Returns the uppercase Unicode character.
1110 *----------------------------------------------------------------------
1114 Tcl_UniCharToUpper(ch)
1115 int ch; /* Unicode character to convert. */
1117 int info = GetUniCharInfo(ch);
1119 if (GetCaseType(info) & 0x04) {
1120 return (Tcl_UniChar) (ch - GetDelta(info));
1127 *----------------------------------------------------------------------
1129 * Tcl_UniCharToLower --
1131 * Compute the lowercase equivalent of the given Unicode character.
1134 * Returns the lowercase Unicode character.
1139 *----------------------------------------------------------------------
1143 Tcl_UniCharToLower(ch)
1144 int ch; /* Unicode character to convert. */
1146 int info = GetUniCharInfo(ch);
1148 if (GetCaseType(info) & 0x02) {
1149 return (Tcl_UniChar) (ch + GetDelta(info));
1156 *----------------------------------------------------------------------
1158 * Tcl_UniCharToTitle --
1160 * Compute the titlecase equivalent of the given Unicode character.
1163 * Returns the titlecase Unicode character.
1168 *----------------------------------------------------------------------
1172 Tcl_UniCharToTitle(ch)
1173 int ch; /* Unicode character to convert. */
1175 int info = GetUniCharInfo(ch);
1176 int mode = GetCaseType(info);
1180 * Subtract or add one depending on the original case.
1183 return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1184 } else if (mode == 0x4) {
1185 return (Tcl_UniChar) (ch - GetDelta(info));
1192 *----------------------------------------------------------------------
1196 * Find the length of a UniChar string. The str input must be null
1200 * Returns the length of str in UniChars (not bytes).
1205 *----------------------------------------------------------------------
1210 CONST Tcl_UniChar *str; /* Unicode string to find length of. */
1214 while (*str != '\0') {
1222 *----------------------------------------------------------------------
1224 * Tcl_UniCharNcmp --
1226 * Compare at most n unichars of string cs to string ct. Both cs
1227 * and ct are assumed to be at least n unichars long.
1230 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1235 *----------------------------------------------------------------------
1239 Tcl_UniCharNcmp(cs, ct, n)
1240 CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */
1241 CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
1242 unsigned long n; /* Number of unichars to compare. */
1244 #ifdef WORDS_BIGENDIAN
1246 * We are definitely on a big-endian machine; memcmp() is safe
1248 return memcmp(cs, ct, n*sizeof(Tcl_UniChar));
1250 #else /* !WORDS_BIGENDIAN */
1252 * We can't simply call memcmp() because that is not lexically correct.
1254 for ( ; n != 0; cs++, ct++, n--) {
1260 #endif /* WORDS_BIGENDIAN */
1264 *----------------------------------------------------------------------
1266 * Tcl_UniCharNcasecmp --
1268 * Compare at most n unichars of string cs to string ct case
1269 * insensitive. Both cs and ct are assumed to be at least n
1273 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1278 *----------------------------------------------------------------------
1282 Tcl_UniCharNcasecmp(cs, ct, n)
1283 CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */
1284 CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
1285 unsigned long n; /* Number of unichars to compare. */
1287 for ( ; n != 0; n--, cs++, ct++) {
1289 (Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) {
1297 *----------------------------------------------------------------------
1299 * Tcl_UniCharIsAlnum --
1301 * Test if a character is an alphanumeric Unicode character.
1304 * Returns 1 if character is alphanumeric.
1309 *----------------------------------------------------------------------
1313 Tcl_UniCharIsAlnum(ch)
1314 int ch; /* Unicode character to test. */
1316 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1318 return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
1322 *----------------------------------------------------------------------
1324 * Tcl_UniCharIsAlpha --
1326 * Test if a character is an alphabetic Unicode character.
1329 * Returns 1 if character is alphabetic.
1334 *----------------------------------------------------------------------
1338 Tcl_UniCharIsAlpha(ch)
1339 int ch; /* Unicode character to test. */
1341 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1342 return ((ALPHA_BITS >> category) & 1);
1346 *----------------------------------------------------------------------
1348 * Tcl_UniCharIsControl --
1350 * Test if a character is a Unicode control character.
1353 * Returns non-zero if character is a control.
1358 *----------------------------------------------------------------------
1362 Tcl_UniCharIsControl(ch)
1363 int ch; /* Unicode character to test. */
1365 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1369 *----------------------------------------------------------------------
1371 * Tcl_UniCharIsDigit --
1373 * Test if a character is a numeric Unicode character.
1376 * Returns non-zero if character is a digit.
1381 *----------------------------------------------------------------------
1385 Tcl_UniCharIsDigit(ch)
1386 int ch; /* Unicode character to test. */
1388 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
1389 == DECIMAL_DIGIT_NUMBER);
1393 *----------------------------------------------------------------------
1395 * Tcl_UniCharIsGraph --
1397 * Test if a character is any Unicode print character except space.
1400 * Returns non-zero if character is printable, but not space.
1405 *----------------------------------------------------------------------
1409 Tcl_UniCharIsGraph(ch)
1410 int ch; /* Unicode character to test. */
1412 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1413 return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1417 *----------------------------------------------------------------------
1419 * Tcl_UniCharIsLower --
1421 * Test if a character is a lowercase Unicode character.
1424 * Returns non-zero if character is lowercase.
1429 *----------------------------------------------------------------------
1433 Tcl_UniCharIsLower(ch)
1434 int ch; /* Unicode character to test. */
1436 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1440 *----------------------------------------------------------------------
1442 * Tcl_UniCharIsPrint --
1444 * Test if a character is a Unicode print character.
1447 * Returns non-zero if character is printable.
1452 *----------------------------------------------------------------------
1456 Tcl_UniCharIsPrint(ch)
1457 int ch; /* Unicode character to test. */
1459 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1460 return ((PRINT_BITS >> category) & 1);
1464 *----------------------------------------------------------------------
1466 * Tcl_UniCharIsPunct --
1468 * Test if a character is a Unicode punctuation character.
1471 * Returns non-zero if character is punct.
1476 *----------------------------------------------------------------------
1480 Tcl_UniCharIsPunct(ch)
1481 int ch; /* Unicode character to test. */
1483 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1484 return ((PUNCT_BITS >> category) & 1);
1488 *----------------------------------------------------------------------
1490 * Tcl_UniCharIsSpace --
1492 * Test if a character is a whitespace Unicode character.
1495 * Returns non-zero if character is a space.
1500 *----------------------------------------------------------------------
1504 Tcl_UniCharIsSpace(ch)
1505 int ch; /* Unicode character to test. */
1507 register int category;
1510 * If the character is within the first 127 characters, just use the
1511 * standard C function, otherwise consult the Unicode table.
1515 return isspace(UCHAR(ch)); /* INTL: ISO space */
1517 category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1518 return ((SPACE_BITS >> category) & 1);
1523 *----------------------------------------------------------------------
1525 * Tcl_UniCharIsUpper --
1527 * Test if a character is an uppercase Unicode character.
1530 * Returns non-zero if character is uppercase.
1535 *----------------------------------------------------------------------
1539 Tcl_UniCharIsUpper(ch)
1540 int ch; /* Unicode character to test. */
1542 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1546 *----------------------------------------------------------------------
1548 * Tcl_UniCharIsWordChar --
1550 * Test if a character is alphanumeric or a connector punctuation
1554 * Returns 1 if character is a word character.
1559 *----------------------------------------------------------------------
1563 Tcl_UniCharIsWordChar(ch)
1564 int ch; /* Unicode character to test. */
1566 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1568 return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
1572 *----------------------------------------------------------------------
1574 * Tcl_UniCharCaseMatch --
1576 * See if a particular Unicode string matches a particular pattern.
1577 * Allows case insensitivity. This is the Unicode equivalent of
1578 * the char* Tcl_StringCaseMatch.
1581 * The return value is 1 if string matches pattern, and
1582 * 0 otherwise. The matching operation permits the following
1583 * special characters in the pattern: *?\[] (see the manual
1584 * entry for details on what these mean).
1589 *----------------------------------------------------------------------
1593 Tcl_UniCharCaseMatch(string, pattern, nocase)
1594 CONST Tcl_UniChar *string; /* Unicode String. */
1595 CONST Tcl_UniChar *pattern; /* Pattern, which may contain special
1597 int nocase; /* 0 for case sensitive, 1 for insensitive */
1605 * See if we're at the end of both the pattern and the string. If
1606 * so, we succeeded. If we're at the end of the pattern but not at
1607 * the end of the string, we failed.
1611 return (*string == 0);
1613 if ((*string == 0) && (p != '*')) {
1618 * Check for a "*" as the next pattern character. It matches any
1619 * substring. We handle this by skipping all the characters up to the
1620 * next matching one in the pattern, and then calling ourselves
1621 * recursively for each postfix of string, until either we match or we
1622 * reach the end of the string.
1627 * Skip all successive *'s in the pattern
1629 while (*(++pattern) == '*') {}
1635 p = Tcl_UniCharToLower(p);
1639 * Optimization for matching - cruise through the string
1640 * quickly if the next char in the pattern isn't a special
1643 if ((p != '[') && (p != '?') && (p != '\\')) {
1645 while (*string && (p != *string)
1646 && (p != Tcl_UniCharToLower(*string))) {
1650 while (*string && (p != *string)) { string++; }
1653 if (Tcl_UniCharCaseMatch(string, pattern, nocase)) {
1664 * Check for a "?" as the next pattern character. It matches
1665 * any single character.
1675 * Check for a "[" as the next pattern character. It is followed
1676 * by a list of characters that are acceptable, or by a range
1677 * (two characters separated by "-").
1681 Tcl_UniChar startChar, endChar;
1684 ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
1687 if ((*pattern == ']') || (*pattern == 0)) {
1690 startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
1692 if (*pattern == '-') {
1694 if (*pattern == 0) {
1697 endChar = (nocase ? Tcl_UniCharToLower(*pattern)
1700 if (((startChar <= ch1) && (ch1 <= endChar))
1701 || ((endChar <= ch1) && (ch1 <= startChar))) {
1703 * Matches ranges of form [a-z] or [z-a].
1707 } else if (startChar == ch1) {
1711 while (*pattern != ']') {
1712 if (*pattern == 0) {
1723 * If the next pattern character is '\', just strip off the '\'
1724 * so we do exact matching on the character that follows.
1728 if (*(++pattern) == '\0') {
1734 * There's no special character. Just make sure that the next
1735 * bytes of each string match.
1739 if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
1742 } else if (*string != *pattern) {