tcl/generic/tclUtf.c

   1 /*
   2  * tclUtf.c --
   3  *
   4  *      Routines for manipulating UTF-8 strings.
   5  *
   6  * Copyright (c) 1997-1998 Sun Microsystems, Inc.
   7  *
   8  * See the file "license.terms" for information on usage and redistribution
   9  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  10  *
  11  * RCS: @(#) $Id$
  12  */
  13
  14 #include "tclInt.h"
  15
  16 /*
  17  * Include the static character classification tables and macros.
  18  */
  19
  20 #include "tclUniData.c"
  21
  22 /*
  23  * The following macros are used for fast character category tests.  The
  24  * x_BITS values are shifted right by the category value to determine whether
  25  * the given category is included in the set.
  26  */
  27
  28 #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
  29     | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
  30
  31 #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
  32
  33 #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
  34     | (1 << PARAGRAPH_SEPARATOR))
  35
  36 #define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
  37
  38 #define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
  39             (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
  40             (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
  41             (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
  42             (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
  43             (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
  44             (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
  45             (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
  46             (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
  47
  48 #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
  49             (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
  50             (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
  51             (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
  52
  53 /*
  54  * Unicode characters less than this value are represented by themselves
  55  * in UTF-8 strings.
  56  */
  57
  58 #define UNICODE_SELF    0x80
  59
  60 /*
  61  * The following structures are used when mapping between Unicode (UCS-2)
  62  * and UTF-8.
  63  */
  64
  65 static CONST unsigned char totalBytes[256] = {
  66     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  67     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  68     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  69     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  70     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  71     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  72     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  73     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  74 #if TCL_UTF_MAX > 3
  75     4,4,4,4,4,4,4,4,
  76 #else
  77     1,1,1,1,1,1,1,1,
  78 #endif
  79 #if TCL_UTF_MAX > 4
  80     5,5,5,5,
  81 #else
  82     1,1,1,1,
  83 #endif
  84 #if TCL_UTF_MAX > 5
  85     6,6,6,6
  86 #else
  87     1,1,1,1
  88 #endif
  89 };
  90
  91 /*
  92  * Procedures used only in this module.
  93  */
  94
  95 static int UtfCount _ANSI_ARGS_((int ch));
  96
  97 \f
  98 /*
  99  *---------------------------------------------------------------------------
 100  *
 101  * UtfCount --
 102  *
 103  *      Find the number of bytes in the Utf character "ch".
 104  *
 105  * Results:
 106  *      The return values is the number of bytes in the Utf character "ch".
 107  *
 108  * Side effects:
 109  *      None.
 110  *
 111  *---------------------------------------------------------------------------
 112  */
 113
 114 INLINE static int
 115 UtfCount(ch)
 116     int ch;                     /* The Tcl_UniChar whose size is returned. */
 117 {
 118     if ((ch > 0) && (ch < UNICODE_SELF)) {
 119         return 1;
 120     }
 121     if (ch <= 0x7FF) {
 122         return 2;
 123     }
 124     if (ch <= 0xFFFF) {
 125         return 3;
 126     }
 127 #if TCL_UTF_MAX > 3
 128     if (ch <= 0x1FFFFF) {
 129         return 4;
 130     }
 131     if (ch <= 0x3FFFFFF) {
 132         return 5;
 133     }
 134     if (ch <= 0x7FFFFFFF) {
 135         return 6;
 136     }
 137 #endif
 138     return 3;
 139 }
 140
 141 /*
 142  *---------------------------------------------------------------------------
 143  *
 144  * Tcl_UniCharToUtf --
 145  *
 146  *      Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
 147  *      provided buffer.  Equivalent to Plan 9 runetochar().
 148  *
 149  * Results:
 150  *      The return values is the number of bytes in the buffer that
 151  *      were consumed.
 152  *
 153  * Side effects:
 154  *      None.
 155  *
 156  *---------------------------------------------------------------------------
 157  */
 158
 159 INLINE int
 160 Tcl_UniCharToUtf(ch, str)
 161     int ch;                     /* The Tcl_UniChar to be stored in the
 162                                  * buffer. */
 163     char *str;                  /* Buffer in which the UTF-8 representation
 164                                  * of the Tcl_UniChar is stored.  Buffer must
 165                                  * be large enough to hold the UTF-8 character
 166                                  * (at most TCL_UTF_MAX bytes). */
 167 {
 168     if ((ch > 0) && (ch < UNICODE_SELF)) {
 169         str[0] = (char) ch;
 170         return 1;
 171     }
 172     if (ch <= 0x7FF) {
 173         str[1] = (char) ((ch | 0x80) & 0xBF);
 174         str[0] = (char) ((ch >> 6) | 0xC0);
 175         return 2;
 176     }
 177     if (ch <= 0xFFFF) {
 178         three:
 179         str[2] = (char) ((ch | 0x80) & 0xBF);
 180         str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
 181         str[0] = (char) ((ch >> 12) | 0xE0);
 182         return 3;
 183     }
 184
 185 #if TCL_UTF_MAX > 3
 186     if (ch <= 0x1FFFFF) {
 187         str[3] = (char) ((ch | 0x80) & 0xBF);
 188         str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
 189         str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
 190         str[0] = (char) ((ch >> 18) | 0xF0);
 191         return 4;
 192     }
 193     if (ch <= 0x3FFFFFF) {
 194         str[4] = (char) ((ch | 0x80) & 0xBF);
 195         str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
 196         str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
 197         str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
 198         str[0] = (char) ((ch >> 24) | 0xF8);
 199         return 5;
 200     }
 201     if (ch <= 0x7FFFFFFF) {
 202         str[5] = (char) ((ch | 0x80) & 0xBF);
 203         str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
 204         str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
 205         str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
 206         str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
 207         str[0] = (char) ((ch >> 30) | 0xFC);
 208         return 6;
 209     }
 210 #endif
 211
 212     ch = 0xFFFD;
 213     goto three;
 214 }
 215 \f
 216 /*
 217  *---------------------------------------------------------------------------
 218  *
 219  * Tcl_UniCharToUtfDString --
 220  *
 221  *      Convert the given Unicode string to UTF-8.
 222  *
 223  * Results:
 224  *      The return value is a pointer to the UTF-8 representation of the
 225  *      Unicode string.  Storage for the return value is appended to the
 226  *      end of dsPtr.
 227  *
 228  * Side effects:
 229  *      None.
 230  *
 231  *---------------------------------------------------------------------------
 232  */
 233
 234 char *
 235 Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
 236     CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
 237     int numChars;               /* Length of Unicode string in Tcl_UniChars
 238                                  * (must be >= 0). */
 239     Tcl_DString *dsPtr;         /* UTF-8 representation of string is
 240                                  * appended to this previously initialized
 241                                  * DString. */
 242 {
 243     CONST Tcl_UniChar *w, *wEnd;
 244     char *p, *string;
 245     int oldLength;
 246
 247     /*
 248      * UTF-8 string length in bytes will be <= Unicode string length *
 249      * TCL_UTF_MAX.
 250      */
 251
 252     oldLength = Tcl_DStringLength(dsPtr);
 253     Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
 254     string = Tcl_DStringValue(dsPtr) + oldLength;
 255
 256     p = string;
 257     wEnd = wString + numChars;
 258     for (w = wString; w < wEnd; ) {
 259         p += Tcl_UniCharToUtf(*w, p);
 260         w++;
 261     }
 262     Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
 263
 264     return string;
 265 }
 266 \f
 267 /*
 268  *---------------------------------------------------------------------------
 269  *
 270  * Tcl_UtfToUniChar --
 271  *
 272  *      Extract the Tcl_UniChar represented by the UTF-8 string.  Bad
 273  *      UTF-8 sequences are converted to valid Tcl_UniChars and processing
 274  *      continues.  Equivalent to Plan 9 chartorune().
 275  *
 276  *      The caller must ensure that the source buffer is long enough that
 277  *      this routine does not run off the end and dereference non-existent
 278  *      memory looking for trail bytes.  If the source buffer is known to
 279  *      be '\0' terminated, this cannot happen.  Otherwise, the caller
 280  *      should call Tcl_UtfCharComplete() before calling this routine to
 281  *      ensure that enough bytes remain in the string.
 282  *
 283  * Results:
 284  *      *chPtr is filled with the Tcl_UniChar, and the return value is the
 285  *      number of bytes from the UTF-8 string that were consumed.
 286  *
 287  * Side effects:
 288  *      None.
 289  *
 290  *---------------------------------------------------------------------------
 291  */
 292
 293 int
 294 Tcl_UtfToUniChar(str, chPtr)
 295     register CONST char *str;    /* The UTF-8 string. */
 296     register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
 297                                   * by the UTF-8 string. */
 298 {
 299     register int byte;
 300
 301     /*
 302      * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
 303      */
 304
 305     byte = *((unsigned char *) str);
 306     if (byte < 0xC0) {
 307         /*
 308          * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
 309          * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
 310          * characters representing themselves.
 311          */
 312
 313         *chPtr = (Tcl_UniChar) byte;
 314         return 1;
 315     } else if (byte < 0xE0) {
 316         if ((str[1] & 0xC0) == 0x80) {
 317             /*
 318              * Two-byte-character lead-byte followed by a trail-byte.
 319              */
 320
 321             *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
 322             return 2;
 323         }
 324         /*
 325          * A two-byte-character lead-byte not followed by trail-byte
 326          * represents itself.
 327          */
 328
 329         *chPtr = (Tcl_UniChar) byte;
 330         return 1;
 331     } else if (byte < 0xF0) {
 332         if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
 333             /*
 334              * Three-byte-character lead byte followed by two trail bytes.
 335              */
 336
 337             *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
 338                     | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
 339             return 3;
 340         }
 341         /*
 342          * A three-byte-character lead-byte not followed by two trail-bytes
 343          * represents itself.
 344          */
 345
 346         *chPtr = (Tcl_UniChar) byte;
 347         return 1;
 348     }
 349 #if TCL_UTF_MAX > 3
 350     else {
 351         int ch, total, trail;
 352
 353         total = totalBytes[byte];
 354         trail = total - 1;
 355         if (trail > 0) {
 356             ch = byte & (0x3F >> trail);
 357             do {
 358                 str++;
 359                 if ((*str & 0xC0) != 0x80) {
 360                     *chPtr = byte;
 361                     return 1;
 362                 }
 363                 ch <<= 6;
 364                 ch |= (*str & 0x3F);
 365                 trail--;
 366             } while (trail > 0);
 367             *chPtr = ch;
 368             return total;
 369         }
 370     }
 371 #endif
 372
 373     *chPtr = (Tcl_UniChar) byte;
 374     return 1;
 375 }
 376 \f
 377 /*
 378  *---------------------------------------------------------------------------
 379  *
 380  * Tcl_UtfToUniCharDString --
 381  *
 382  *      Convert the UTF-8 string to Unicode.
 383  *
 384  * Results:
 385  *      The return value is a pointer to the Unicode representation of the
 386  *      UTF-8 string.  Storage for the return value is appended to the
 387  *      end of dsPtr.  The Unicode string is terminated with a Unicode
 388  *      NULL character.
 389  *
 390  * Side effects:
 391  *      None.
 392  *
 393  *---------------------------------------------------------------------------
 394  */
 395
 396 Tcl_UniChar *
 397 Tcl_UtfToUniCharDString(string, length, dsPtr)
 398     CONST char *string;         /* UTF-8 string to convert to Unicode. */
 399     int length;                 /* Length of UTF-8 string in bytes, or -1
 400                                  * for strlen(). */
 401     Tcl_DString *dsPtr;         /* Unicode representation of string is
 402                                  * appended to this previously initialized
 403                                  * DString. */
 404 {
 405     Tcl_UniChar *w, *wString;
 406     CONST char *p, *end;
 407     int oldLength;
 408
 409     if (length < 0) {
 410         length = strlen(string);
 411     }
 412
 413     /*
 414      * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
 415      * in bytes.
 416      */
 417
 418     oldLength = Tcl_DStringLength(dsPtr);
 419     Tcl_DStringSetLength(dsPtr,
 420             (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
 421     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
 422
 423     w = wString;
 424     end = string + length;
 425     for (p = string; p < end; ) {
 426         p += Tcl_UtfToUniChar(p, w);
 427         w++;
 428     }
 429     *w = '\0';
 430     Tcl_DStringSetLength(dsPtr,
 431             (oldLength + ((char *) w - (char *) wString)));
 432
 433     return wString;
 434 }
 435 \f
 436 /*
 437  *---------------------------------------------------------------------------
 438  *
 439  * Tcl_UtfCharComplete --
 440  *
 441  *      Determine if the UTF-8 string of the given length is long enough
 442  *      to be decoded by Tcl_UtfToUniChar().  This does not ensure that the
 443  *      UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune().
 444  *
 445  * Results:
 446  *      The return value is 0 if the string is not long enough, non-zero
 447  *      otherwise.
 448  *
 449  * Side effects:
 450  *      None.
 451  *
 452  *---------------------------------------------------------------------------
 453  */
 454
 455 int
 456 Tcl_UtfCharComplete(str, len)
 457     CONST char *str;            /* String to check if first few bytes
 458                                  * contain a complete UTF-8 character. */
 459     int len;                    /* Length of above string in bytes. */
 460 {
 461     int ch;
 462
 463     ch = *((unsigned char *) str);
 464     return len >= totalBytes[ch];
 465 }
 466 \f
 467 /*
 468  *---------------------------------------------------------------------------
 469  *
 470  * Tcl_NumUtfChars --
 471  *
 472  *      Returns the number of characters (not bytes) in the UTF-8 string,
 473  *      not including the terminating NULL byte.  This is equivalent to
 474  *      Plan 9 utflen() and utfnlen().
 475  *
 476  * Results:
 477  *      As above.
 478  *
 479  * Side effects:
 480  *      None.
 481  *
 482  *---------------------------------------------------------------------------
 483  */
 484
 485 int
 486 Tcl_NumUtfChars(str, len)
 487     register CONST char *str;   /* The UTF-8 string to measure. */
 488     int len;                    /* The length of the string in bytes, or -1
 489                                  * for strlen(string). */
 490 {
 491     Tcl_UniChar ch;
 492     register Tcl_UniChar *chPtr = &ch;
 493     register int n;
 494     int i;
 495
 496     /*
 497      * The separate implementations are faster.
 498      */
 499
 500     i = 0;
 501     if (len < 0) {
 502         while (1) {
 503             str += Tcl_UtfToUniChar(str, chPtr);
 504             if (ch == '\0') {
 505                 break;
 506             }
 507             i++;
 508         }
 509     } else {
 510         while (len > 0) {
 511             n = Tcl_UtfToUniChar(str, chPtr);
 512             len -= n;
 513             str += n;
 514             i++;
 515         }
 516     }
 517     return i;
 518 }
 519 \f
 520 /*
 521  *---------------------------------------------------------------------------
 522  *
 523  * Tcl_UtfFindFirst --
 524  *
 525  *      Returns a pointer to the first occurance of the given Tcl_UniChar
 526  *      in the NULL-terminated UTF-8 string.  The NULL terminator is
 527  *      considered part of the UTF-8 string.  Equivalent to Plan 9
 528  *      utfrune().
 529  *
 530  * Results:
 531  *      As above.  If the Tcl_UniChar does not exist in the given string,
 532  *      the return value is NULL.
 533  *
 534  * Side effects:
 535  *      None.
 536  *
 537  *---------------------------------------------------------------------------
 538  */
 539 CONST char *
 540 Tcl_UtfFindFirst(string, ch)
 541     CONST char *string;         /* The UTF-8 string to be searched. */
 542     int ch;                     /* The Tcl_UniChar to search for. */
 543 {
 544     int len;
 545     Tcl_UniChar find;
 546
 547     while (1) {
 548         len = Tcl_UtfToUniChar(string, &find);
 549         if (find == ch) {
 550             return string;
 551         }
 552         if (*string == '\0') {
 553             return NULL;
 554         }
 555         string += len;
 556     }
 557 }
 558 \f
 559 /*
 560  *---------------------------------------------------------------------------
 561  *
 562  * Tcl_UtfFindLast --
 563  *
 564  *      Returns a pointer to the last occurance of the given Tcl_UniChar
 565  *      in the NULL-terminated UTF-8 string.  The NULL terminator is
 566  *      considered part of the UTF-8 string.  Equivalent to Plan 9
 567  *      utfrrune().
 568  *
 569  * Results:
 570  *      As above.  If the Tcl_UniChar does not exist in the given string,
 571  *      the return value is NULL.
 572  *
 573  * Side effects:
 574  *      None.
 575  *
 576  *---------------------------------------------------------------------------
 577  */
 578
 579 CONST char *
 580 Tcl_UtfFindLast(string, ch)
 581     CONST char *string;         /* The UTF-8 string to be searched. */
 582     int ch;                     /* The Tcl_UniChar to search for. */
 583 {
 584     int len;
 585     Tcl_UniChar find;
 586     CONST char *last;
 587
 588     last = NULL;
 589     while (1) {
 590         len = Tcl_UtfToUniChar(string, &find);
 591         if (find == ch) {
 592             last = string;
 593         }
 594         if (*string == '\0') {
 595             break;
 596         }
 597         string += len;
 598     }
 599     return last;
 600 }
 601 \f
 602 /*
 603  *---------------------------------------------------------------------------
 604  *
 605  * Tcl_UtfNext --
 606  *
 607  *      Given a pointer to some current location in a UTF-8 string,
 608  *      move forward one character.  The caller must ensure that they
 609  *      are not asking for the next character after the last character
 610  *      in the string.
 611  *
 612  * Results:
 613  *      The return value is the pointer to the next character in
 614  *      the UTF-8 string.
 615  *
 616  * Side effects:
 617  *      None.
 618  *
 619  *---------------------------------------------------------------------------
 620  */
 621
 622 CONST char *
 623 Tcl_UtfNext(str)
 624     CONST char *str;                /* The current location in the string. */
 625 {
 626     Tcl_UniChar ch;
 627
 628     return str + Tcl_UtfToUniChar(str, &ch);
 629 }
 630 \f
 631 /*
 632  *---------------------------------------------------------------------------
 633  *
 634  * Tcl_UtfPrev --
 635  *
 636  *      Given a pointer to some current location in a UTF-8 string,
 637  *      move backwards one character.  This works correctly when the
 638  *      pointer is in the middle of a UTF-8 character.
 639  *
 640  * Results:
 641  *      The return value is a pointer to the previous character in the
 642  *      UTF-8 string.  If the current location was already at the
 643  *      beginning of the string, the return value will also be a
 644  *      pointer to the beginning of the string.
 645  *
 646  * Side effects:
 647  *      None.
 648  *
 649  *---------------------------------------------------------------------------
 650  */
 651
 652 CONST char *
 653 Tcl_UtfPrev(str, start)
 654     CONST char *str;                /* The current location in the string. */
 655     CONST char *start;              /* Pointer to the beginning of the
 656                                      * string, to avoid going backwards too
 657                                      * far. */
 658 {
 659     CONST char *look;
 660     int i, byte;
 661
 662     str--;
 663     look = str;
 664     for (i = 0; i < TCL_UTF_MAX; i++) {
 665         if (look < start) {
 666             if (str < start) {
 667                 str = start;
 668             }
 669             break;
 670         }
 671         byte = *((unsigned char *) look);
 672         if (byte < 0x80) {
 673             break;
 674         }
 675         if (byte >= 0xC0) {
 676             return look;
 677         }
 678         look--;
 679     }
 680     return str;
 681 }
 682 \f
 683 /*
 684  *---------------------------------------------------------------------------
 685  *
 686  * Tcl_UniCharAtIndex --
 687  *
 688  *      Returns the Unicode character represented at the specified
 689  *      character (not byte) position in the UTF-8 string.
 690  *
 691  * Results:
 692  *      As above.
 693  *
 694  * Side effects:
 695  *      None.
 696  *
 697  *---------------------------------------------------------------------------
 698  */
 699
 700 Tcl_UniChar
 701 Tcl_UniCharAtIndex(src, index)
 702     register CONST char *src;   /* The UTF-8 string to dereference. */
 703     register int index;         /* The position of the desired character. */
 704 {
 705     Tcl_UniChar ch;
 706
 707     while (index >= 0) {
 708         index--;
 709         src += Tcl_UtfToUniChar(src, &ch);
 710     }
 711     return ch;
 712 }
 713 \f
 714 /*
 715  *---------------------------------------------------------------------------
 716  *
 717  * Tcl_UtfAtIndex --
 718  *
 719  *      Returns a pointer to the specified character (not byte) position
 720  *      in the UTF-8 string.
 721  *
 722  * Results:
 723  *      As above.
 724  *
 725  * Side effects:
 726  *      None.
 727  *
 728  *---------------------------------------------------------------------------
 729  */
 730
 731 CONST char *
 732 Tcl_UtfAtIndex(src, index)
 733     register CONST char *src;   /* The UTF-8 string. */
 734     register int index;         /* The position of the desired character. */
 735 {
 736     Tcl_UniChar ch;
 737
 738     while (index > 0) {
 739         index--;
 740         src += Tcl_UtfToUniChar(src, &ch);
 741     }
 742     return src;
 743 }
 744 \f
 745 /*
 746  *---------------------------------------------------------------------------
 747  *
 748  * Tcl_UtfBackslash --
 749  *
 750  *      Figure out how to handle a backslash sequence.
 751  *
 752  * Results:
 753  *      Stores the bytes represented by the backslash sequence in dst and
 754  *      returns the number of bytes written to dst.  At most TCL_UTF_MAX
 755  *      bytes are written to dst; dst must have been large enough to accept
 756  *      those bytes.  If readPtr isn't NULL then it is filled in with a
 757  *      count of the number of bytes in the backslash sequence.
 758  *
 759  * Side effects:
 760  *      The maximum number of bytes it takes to represent a Unicode
 761  *      character in UTF-8 is guaranteed to be less than the number of
 762  *      bytes used to express the backslash sequence that represents
 763  *      that Unicode character.  If the target buffer into which the
 764  *      caller is going to store the bytes that represent the Unicode
 765  *      character is at least as large as the source buffer from which
 766  *      the backslashed sequence was extracted, no buffer overruns should
 767  *      occur.
 768  *
 769  *---------------------------------------------------------------------------
 770  */
 771
 772 int
 773 Tcl_UtfBackslash(src, readPtr, dst)
 774     CONST char *src;            /* Points to the backslash character of
 775                                  * a backslash sequence. */
 776     int *readPtr;               /* Fill in with number of characters read
 777                                  * from src, unless NULL. */
 778     char *dst;                  /* Filled with the bytes represented by the
 779                                  * backslash sequence. */
 780 {
 781 #define LINE_LENGTH 128
 782     int numRead;
 783     int result;
 784
 785     result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
 786     if (numRead == LINE_LENGTH) {
 787         /* We ate a whole line.  Pay the price of a strlen() */
 788         result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
 789     }
 790     if (readPtr != NULL) {
 791         *readPtr = numRead;
 792     }
 793     return result;
 794 }
 795 \f
 796 /*
 797  *----------------------------------------------------------------------
 798  *
 799  * Tcl_UtfToUpper --
 800  *
 801  *      Convert lowercase characters to uppercase characters in a UTF
 802  *      string in place.  The conversion may shrink the UTF string.
 803  *
 804  * Results:
 805  *      Returns the number of bytes in the resulting string
 806  *      excluding the trailing null.
 807  *
 808  * Side effects:
 809  *      Writes a terminating null after the last converted character.
 810  *
 811  *----------------------------------------------------------------------
 812  */
 813
 814 int
 815 Tcl_UtfToUpper(str)
 816     char *str;                  /* String to convert in place. */
 817 {
 818     Tcl_UniChar ch, upChar;
 819     char *src, *dst;
 820     int bytes;
 821
 822     /*
 823      * Iterate over the string until we hit the terminating null.
 824      */
 825
 826     src = dst = str;
 827     while (*src) {
 828         bytes = Tcl_UtfToUniChar(src, &ch);
 829         upChar = Tcl_UniCharToUpper(ch);
 830
 831         /*
 832          * To keep badly formed Utf strings from getting inflated by
 833          * the conversion (thereby causing a segfault), only copy the
 834          * upper case char to dst if its size is <= the original char.
 835          */
 836
 837         if (bytes < UtfCount(upChar)) {
 838             memcpy(dst, src, (size_t) bytes);
 839             dst += bytes;
 840         } else {
 841             dst += Tcl_UniCharToUtf(upChar, dst);
 842         }
 843         src += bytes;
 844     }
 845     *dst = '\0';
 846     return (dst - str);
 847 }
 848 \f
 849 /*
 850  *----------------------------------------------------------------------
 851  *
 852  * Tcl_UtfToLower --
 853  *
 854  *      Convert uppercase characters to lowercase characters in a UTF
 855  *      string in place.  The conversion may shrink the UTF string.
 856  *
 857  * Results:
 858  *      Returns the number of bytes in the resulting string
 859  *      excluding the trailing null.
 860  *
 861  * Side effects:
 862  *      Writes a terminating null after the last converted character.
 863  *
 864  *----------------------------------------------------------------------
 865  */
 866
 867 int
 868 Tcl_UtfToLower(str)
 869     char *str;                  /* String to convert in place. */
 870 {
 871     Tcl_UniChar ch, lowChar;
 872     char *src, *dst;
 873     int bytes;
 874
 875     /*
 876      * Iterate over the string until we hit the terminating null.
 877      */
 878
 879     src = dst = str;
 880     while (*src) {
 881         bytes = Tcl_UtfToUniChar(src, &ch);
 882         lowChar = Tcl_UniCharToLower(ch);
 883
 884         /*
 885          * To keep badly formed Utf strings from getting inflated by
 886          * the conversion (thereby causing a segfault), only copy the
 887          * lower case char to dst if its size is <= the original char.
 888          */
 889
 890         if (bytes < UtfCount(lowChar)) {
 891             memcpy(dst, src, (size_t) bytes);
 892             dst += bytes;
 893         } else {
 894             dst += Tcl_UniCharToUtf(lowChar, dst);
 895         }
 896         src += bytes;
 897     }
 898     *dst = '\0';
 899     return (dst - str);
 900 }
 901 \f
 902 /*
 903  *----------------------------------------------------------------------
 904  *
 905  * Tcl_UtfToTitle --
 906  *
 907  *      Changes the first character of a UTF string to title case or
 908  *      uppercase and the rest of the string to lowercase.  The
 909  *      conversion happens in place and may shrink the UTF string.
 910  *
 911  * Results:
 912  *      Returns the number of bytes in the resulting string
 913  *      excluding the trailing null.
 914  *
 915  * Side effects:
 916  *      Writes a terminating null after the last converted character.
 917  *
 918  *----------------------------------------------------------------------
 919  */
 920
 921 int
 922 Tcl_UtfToTitle(str)
 923     char *str;                  /* String to convert in place. */
 924 {
 925     Tcl_UniChar ch, titleChar, lowChar;
 926     char *src, *dst;
 927     int bytes;
 928
 929     /*
 930      * Capitalize the first character and then lowercase the rest of the
 931      * characters until we get to a null.
 932      */
 933
 934     src = dst = str;
 935
 936     if (*src) {
 937         bytes = Tcl_UtfToUniChar(src, &ch);
 938         titleChar = Tcl_UniCharToTitle(ch);
 939
 940         if (bytes < UtfCount(titleChar)) {
 941             memcpy(dst, src, (size_t) bytes);
 942             dst += bytes;
 943         } else {
 944             dst += Tcl_UniCharToUtf(titleChar, dst);
 945         }
 946         src += bytes;
 947     }
 948     while (*src) {
 949         bytes = Tcl_UtfToUniChar(src, &ch);
 950         lowChar = Tcl_UniCharToLower(ch);
 951
 952         if (bytes < UtfCount(lowChar)) {
 953             memcpy(dst, src, (size_t) bytes);
 954             dst += bytes;
 955         } else {
 956             dst += Tcl_UniCharToUtf(lowChar, dst);
 957         }
 958         src += bytes;
 959     }
 960     *dst = '\0';
 961     return (dst - str);
 962 }
 963 \f
 964 /*
 965  *----------------------------------------------------------------------
 966  *
 967  * TclpUtfNcmp2 --
 968  *
 969  *      Compare at most n bytes of utf-8 strings cs and ct.  Both cs
 970  *      and ct are assumed to be at least n bytes long.
 971  *
 972  * Results:
 973  *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
 974  *
 975  * Side effects:
 976  *      None.
 977  *
 978  *----------------------------------------------------------------------
 979  */
 980
 981 int
 982 TclpUtfNcmp2(cs, ct, n)
 983     CONST char *cs;             /* UTF string to compare to ct. */
 984     CONST char *ct;             /* UTF string cs is compared to. */
 985     unsigned long n;            /* Number of *bytes* to compare. */
 986 {
 987     /*
 988      * We can't simply call 'memcmp(cs, ct, n);' because we need to check
 989      * for Tcl's \xC0\x80 non-utf-8 null encoding.
 990      * Otherwise utf-8 lexes fine in the strcmp manner.
 991      */
 992     register int result = 0;
 993
 994     for ( ; n != 0; n--, cs++, ct++) {
 995         if (*cs != *ct) {
 996             result = UCHAR(*cs) - UCHAR(*ct);
 997             break;
 998         }
 999     }
1000     if (n && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
1001         unsigned char c1, c2;
1002         c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
1003         c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
1004         result = (c1 - c2);
1005     }
1006     return result;
1007 }
1008 \f
1009 /*
1010  *----------------------------------------------------------------------
1011  *
1012  * Tcl_UtfNcmp --
1013  *
1014  *      Compare at most n UTF chars of string cs to string ct.  Both cs
1015  *      and ct are assumed to be at least n UTF chars long.
1016  *
1017  * Results:
1018  *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1019  *
1020  * Side effects:
1021  *      None.
1022  *
1023  *----------------------------------------------------------------------
1024  */
1025
1026 int
1027 Tcl_UtfNcmp(cs, ct, n)
1028     CONST char *cs;             /* UTF string to compare to ct. */
1029     CONST char *ct;             /* UTF string cs is compared to. */
1030     unsigned long n;            /* Number of UTF chars to compare. */
1031 {
1032     Tcl_UniChar ch1, ch2;
1033     /*
1034      * Cannot use 'memcmp(cs, ct, n);' as byte representation of
1035      * \u0000 (the pair of bytes 0xc0,0x80) is larger than byte
1036      * representation of \u0001 (the byte 0x01.)
1037      */
1038     while (n-- > 0) {
1039         /*
1040          * n must be interpreted as chars, not bytes.
1041          * This should be called only when both strings are of
1042          * at least n chars long (no need for \0 check)
1043          */
1044         cs += Tcl_UtfToUniChar(cs, &ch1);
1045         ct += Tcl_UtfToUniChar(ct, &ch2);
1046         if (ch1 != ch2) {
1047             return (ch1 - ch2);
1048         }
1049     }
1050     return 0;
1051 }
1052 \f
1053 /*
1054  *----------------------------------------------------------------------
1055  *
1056  * Tcl_UtfNcasecmp --
1057  *
1058  *      Compare at most n UTF chars of string cs to string ct case
1059  *      insensitive.  Both cs and ct are assumed to be at least n
1060  *      UTF chars long.
1061  *
1062  * Results:
1063  *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1064  *
1065  * Side effects:
1066  *      None.
1067  *
1068  *----------------------------------------------------------------------
1069  */
1070
1071 int
1072 Tcl_UtfNcasecmp(cs, ct, n)
1073     CONST char *cs;             /* UTF string to compare to ct. */
1074     CONST char *ct;             /* UTF string cs is compared to. */
1075     unsigned long n;                    /* Number of UTF chars to compare. */
1076 {
1077     Tcl_UniChar ch1, ch2;
1078     while (n-- > 0) {
1079         /*
1080          * n must be interpreted as chars, not bytes.
1081          * This should be called only when both strings are of
1082          * at least n chars long (no need for \0 check)
1083          */
1084         cs += Tcl_UtfToUniChar(cs, &ch1);
1085         ct += Tcl_UtfToUniChar(ct, &ch2);
1086         if (ch1 != ch2) {
1087             ch1 = Tcl_UniCharToLower(ch1);
1088             ch2 = Tcl_UniCharToLower(ch2);
1089             if (ch1 != ch2) {
1090                 return (ch1 - ch2);
1091             }
1092         }
1093     }
1094     return 0;
1095 }
1096 \f
1097 /*
1098  *----------------------------------------------------------------------
1099  *
1100  * Tcl_UniCharToUpper --
1101  *
1102  *      Compute the uppercase equivalent of the given Unicode character.
1103  *
1104  * Results:
1105  *      Returns the uppercase Unicode character.
1106  *
1107  * Side effects:
1108  *      None.
1109  *
1110  *----------------------------------------------------------------------
1111  */
1112
1113 Tcl_UniChar
1114 Tcl_UniCharToUpper(ch)
1115     int ch;                     /* Unicode character to convert. */
1116 {
1117     int info = GetUniCharInfo(ch);
1118
1119     if (GetCaseType(info) & 0x04) {
1120         return (Tcl_UniChar) (ch - GetDelta(info));
1121     } else {
1122         return ch;
1123     }
1124 }
1125 \f
1126 /*
1127  *----------------------------------------------------------------------
1128  *
1129  * Tcl_UniCharToLower --
1130  *
1131  *      Compute the lowercase equivalent of the given Unicode character.
1132  *
1133  * Results:
1134  *      Returns the lowercase Unicode character.
1135  *
1136  * Side effects:
1137  *      None.
1138  *
1139  *----------------------------------------------------------------------
1140  */
1141
1142 Tcl_UniChar
1143 Tcl_UniCharToLower(ch)
1144     int ch;                     /* Unicode character to convert. */
1145 {
1146     int info = GetUniCharInfo(ch);
1147
1148     if (GetCaseType(info) & 0x02) {
1149         return (Tcl_UniChar) (ch + GetDelta(info));
1150     } else {
1151         return ch;
1152     }
1153 }
1154 \f
1155 /*
1156  *----------------------------------------------------------------------
1157  *
1158  * Tcl_UniCharToTitle --
1159  *
1160  *      Compute the titlecase equivalent of the given Unicode character.
1161  *
1162  * Results:
1163  *      Returns the titlecase Unicode character.
1164  *
1165  * Side effects:
1166  *      None.
1167  *
1168  *----------------------------------------------------------------------
1169  */
1170
1171 Tcl_UniChar
1172 Tcl_UniCharToTitle(ch)
1173     int ch;                     /* Unicode character to convert. */
1174 {
1175     int info = GetUniCharInfo(ch);
1176     int mode = GetCaseType(info);
1177
1178     if (mode & 0x1) {
1179         /*
1180          * Subtract or add one depending on the original case.
1181          */
1182
1183         return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1184     } else if (mode == 0x4) {
1185         return (Tcl_UniChar) (ch - GetDelta(info));
1186     } else {
1187         return ch;
1188     }
1189 }
1190 \f
1191 /*
1192  *----------------------------------------------------------------------
1193  *
1194  * Tcl_UniCharLen --
1195  *
1196  *      Find the length of a UniChar string.  The str input must be null
1197  *      terminated.
1198  *
1199  * Results:
1200  *      Returns the length of str in UniChars (not bytes).
1201  *
1202  * Side effects:
1203  *      None.
1204  *
1205  *----------------------------------------------------------------------
1206  */
1207
1208 int
1209 Tcl_UniCharLen(str)
1210     CONST Tcl_UniChar *str;     /* Unicode string to find length of. */
1211 {
1212     int len = 0;
1213
1214     while (*str != '\0') {
1215         len++;
1216         str++;
1217     }
1218     return len;
1219 }
1220 \f
1221 /*
1222  *----------------------------------------------------------------------
1223  *
1224  * Tcl_UniCharNcmp --
1225  *
1226  *      Compare at most n unichars of string cs to string ct.  Both cs
1227  *      and ct are assumed to be at least n unichars long.
1228  *
1229  * Results:
1230  *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1231  *
1232  * Side effects:
1233  *      None.
1234  *
1235  *----------------------------------------------------------------------
1236  */
1237
1238 int
1239 Tcl_UniCharNcmp(cs, ct, n)
1240     CONST Tcl_UniChar *cs;              /* Unicode string to compare to ct. */
1241     CONST Tcl_UniChar *ct;              /* Unicode string cs is compared to. */
1242     unsigned long n;                    /* Number of unichars to compare. */
1243 {
1244 #ifdef WORDS_BIGENDIAN
1245     /*
1246      * We are definitely on a big-endian machine; memcmp() is safe
1247      */
1248     return memcmp(cs, ct, n*sizeof(Tcl_UniChar));
1249
1250 #else /* !WORDS_BIGENDIAN */
1251     /*
1252      * We can't simply call memcmp() because that is not lexically correct.
1253      */
1254     for ( ; n != 0; cs++, ct++, n--) {
1255         if (*cs != *ct) {
1256             return (*cs - *ct);
1257         }
1258     }
1259     return 0;
1260 #endif /* WORDS_BIGENDIAN */
1261 }
1262 \f
1263 /*
1264  *----------------------------------------------------------------------
1265  *
1266  * Tcl_UniCharNcasecmp --
1267  *
1268  *      Compare at most n unichars of string cs to string ct case
1269  *      insensitive.  Both cs and ct are assumed to be at least n
1270  *      unichars long.
1271  *
1272  * Results:
1273  *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1274  *
1275  * Side effects:
1276  *      None.
1277  *
1278  *----------------------------------------------------------------------
1279  */
1280
1281 int
1282 Tcl_UniCharNcasecmp(cs, ct, n)
1283     CONST Tcl_UniChar *cs;              /* Unicode string to compare to ct. */
1284     CONST Tcl_UniChar *ct;              /* Unicode string cs is compared to. */
1285     unsigned long n;                    /* Number of unichars to compare. */
1286 {
1287     for ( ; n != 0; n--, cs++, ct++) {
1288         if ((*cs != *ct) &&
1289                 (Tcl_UniCharToLower(*cs) != Tcl_UniCharToLower(*ct))) {
1290             return (*cs - *ct);
1291         }
1292     }
1293     return 0;
1294 }
1295 \f
1296 /*
1297  *----------------------------------------------------------------------
1298  *
1299  * Tcl_UniCharIsAlnum --
1300  *
1301  *      Test if a character is an alphanumeric Unicode character.
1302  *
1303  * Results:
1304  *      Returns 1 if character is alphanumeric.
1305  *
1306  * Side effects:
1307  *      None.
1308  *
1309  *----------------------------------------------------------------------
1310  */
1311
1312 int
1313 Tcl_UniCharIsAlnum(ch)
1314     int ch;                     /* Unicode character to test. */
1315 {
1316     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1317
1318     return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
1319 }
1320 \f
1321 /*
1322  *----------------------------------------------------------------------
1323  *
1324  * Tcl_UniCharIsAlpha --
1325  *
1326  *      Test if a character is an alphabetic Unicode character.
1327  *
1328  * Results:
1329  *      Returns 1 if character is alphabetic.
1330  *
1331  * Side effects:
1332  *      None.
1333  *
1334  *----------------------------------------------------------------------
1335  */
1336
1337 int
1338 Tcl_UniCharIsAlpha(ch)
1339     int ch;                     /* Unicode character to test. */
1340 {
1341     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1342     return ((ALPHA_BITS >> category) & 1);
1343 }
1344 \f
1345 /*
1346  *----------------------------------------------------------------------
1347  *
1348  * Tcl_UniCharIsControl --
1349  *
1350  *      Test if a character is a Unicode control character.
1351  *
1352  * Results:
1353  *      Returns non-zero if character is a control.
1354  *
1355  * Side effects:
1356  *      None.
1357  *
1358  *----------------------------------------------------------------------
1359  */
1360
1361 int
1362 Tcl_UniCharIsControl(ch)
1363     int ch;                     /* Unicode character to test. */
1364 {
1365     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1366 }
1367 \f
1368 /*
1369  *----------------------------------------------------------------------
1370  *
1371  * Tcl_UniCharIsDigit --
1372  *
1373  *      Test if a character is a numeric Unicode character.
1374  *
1375  * Results:
1376  *      Returns non-zero if character is a digit.
1377  *
1378  * Side effects:
1379  *      None.
1380  *
1381  *----------------------------------------------------------------------
1382  */
1383
1384 int
1385 Tcl_UniCharIsDigit(ch)
1386     int ch;                     /* Unicode character to test. */
1387 {
1388     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
1389             == DECIMAL_DIGIT_NUMBER);
1390 }
1391 \f
1392 /*
1393  *----------------------------------------------------------------------
1394  *
1395  * Tcl_UniCharIsGraph --
1396  *
1397  *      Test if a character is any Unicode print character except space.
1398  *
1399  * Results:
1400  *      Returns non-zero if character is printable, but not space.
1401  *
1402  * Side effects:
1403  *      None.
1404  *
1405  *----------------------------------------------------------------------
1406  */
1407
1408 int
1409 Tcl_UniCharIsGraph(ch)
1410     int ch;                     /* Unicode character to test. */
1411 {
1412     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1413     return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1414 }
1415 \f
1416 /*
1417  *----------------------------------------------------------------------
1418  *
1419  * Tcl_UniCharIsLower --
1420  *
1421  *      Test if a character is a lowercase Unicode character.
1422  *
1423  * Results:
1424  *      Returns non-zero if character is lowercase.
1425  *
1426  * Side effects:
1427  *      None.
1428  *
1429  *----------------------------------------------------------------------
1430  */
1431
1432 int
1433 Tcl_UniCharIsLower(ch)
1434     int ch;                     /* Unicode character to test. */
1435 {
1436     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1437 }
1438 \f
1439 /*
1440  *----------------------------------------------------------------------
1441  *
1442  * Tcl_UniCharIsPrint --
1443  *
1444  *      Test if a character is a Unicode print character.
1445  *
1446  * Results:
1447  *      Returns non-zero if character is printable.
1448  *
1449  * Side effects:
1450  *      None.
1451  *
1452  *----------------------------------------------------------------------
1453  */
1454
1455 int
1456 Tcl_UniCharIsPrint(ch)
1457     int ch;                     /* Unicode character to test. */
1458 {
1459     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1460     return ((PRINT_BITS >> category) & 1);
1461 }
1462 \f
1463 /*
1464  *----------------------------------------------------------------------
1465  *
1466  * Tcl_UniCharIsPunct --
1467  *
1468  *      Test if a character is a Unicode punctuation character.
1469  *
1470  * Results:
1471  *      Returns non-zero if character is punct.
1472  *
1473  * Side effects:
1474  *      None.
1475  *
1476  *----------------------------------------------------------------------
1477  */
1478
1479 int
1480 Tcl_UniCharIsPunct(ch)
1481     int ch;                     /* Unicode character to test. */
1482 {
1483     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1484     return ((PUNCT_BITS >> category) & 1);
1485 }
1486 \f
1487 /*
1488  *----------------------------------------------------------------------
1489  *
1490  * Tcl_UniCharIsSpace --
1491  *
1492  *      Test if a character is a whitespace Unicode character.
1493  *
1494  * Results:
1495  *      Returns non-zero if character is a space.
1496  *
1497  * Side effects:
1498  *      None.
1499  *
1500  *----------------------------------------------------------------------
1501  */
1502
1503 int
1504 Tcl_UniCharIsSpace(ch)
1505     int ch;                     /* Unicode character to test. */
1506 {
1507     register int category;
1508
1509     /*
1510      * If the character is within the first 127 characters, just use the
1511      * standard C function, otherwise consult the Unicode table.
1512      */
1513
1514     if (ch < 0x80) {
1515         return isspace(UCHAR(ch)); /* INTL: ISO space */
1516     } else {
1517         category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1518         return ((SPACE_BITS >> category) & 1);
1519     }
1520 }
1521 \f
1522 /*
1523  *----------------------------------------------------------------------
1524  *
1525  * Tcl_UniCharIsUpper --
1526  *
1527  *      Test if a character is a uppercase Unicode character.
1528  *
1529  * Results:
1530  *      Returns non-zero if character is uppercase.
1531  *
1532  * Side effects:
1533  *      None.
1534  *
1535  *----------------------------------------------------------------------
1536  */
1537
1538 int
1539 Tcl_UniCharIsUpper(ch)
1540     int ch;                     /* Unicode character to test. */
1541 {
1542     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1543 }
1544 \f
1545 /*
1546  *----------------------------------------------------------------------
1547  *
1548  * Tcl_UniCharIsWordChar --
1549  *
1550  *      Test if a character is alphanumeric or a connector punctuation
1551  *      mark.
1552  *
1553  * Results:
1554  *      Returns 1 if character is a word character.
1555  *
1556  * Side effects:
1557  *      None.
1558  *
1559  *----------------------------------------------------------------------
1560  */
1561
1562 int
1563 Tcl_UniCharIsWordChar(ch)
1564     int ch;                     /* Unicode character to test. */
1565 {
1566     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1567
1568     return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
1569 }
1570 \f
1571 /*
1572  *----------------------------------------------------------------------
1573  *
1574  * Tcl_UniCharCaseMatch --
1575  *
1576  *      See if a particular Unicode string matches a particular pattern.
1577  *      Allows case insensitivity.  This is the Unicode equivalent of
1578  *      the char* Tcl_StringCaseMatch.
1579  *
1580  * Results:
1581  *      The return value is 1 if string matches pattern, and
1582  *      0 otherwise.  The matching operation permits the following
1583  *      special characters in the pattern: *?\[] (see the manual
1584  *      entry for details on what these mean).
1585  *
1586  * Side effects:
1587  *      None.
1588  *
1589  *----------------------------------------------------------------------
1590  */
1591
1592 int
1593 Tcl_UniCharCaseMatch(string, pattern, nocase)
1594     CONST Tcl_UniChar *string;  /* Unicode String. */
1595     CONST Tcl_UniChar *pattern; /* Pattern, which may contain special
1596                                  * characters. */
1597     int nocase;                 /* 0 for case sensitive, 1 for insensitive */
1598 {
1599     Tcl_UniChar ch1, p;
1600
1601     while (1) {
1602         p = *pattern;
1603
1604         /*
1605          * See if we're at the end of both the pattern and the string.  If
1606          * so, we succeeded.  If we're at the end of the pattern but not at
1607          * the end of the string, we failed.
1608          */
1609
1610         if (p == 0) {
1611             return (*string == 0);
1612         }
1613         if ((*string == 0) && (p != '*')) {
1614             return 0;
1615         }
1616
1617         /*
1618          * Check for a "*" as the next pattern character.  It matches any
1619          * substring.  We handle this by skipping all the characters up to the
1620          * next matching one in the pattern, and then calling ourselves
1621          * recursively for each postfix of string, until either we match or we
1622          * reach the end of the string.
1623          */
1624
1625         if (p == '*') {
1626             /*
1627              * Skip all successive *'s in the pattern
1628              */
1629             while (*(++pattern) == '*') {}
1630             p = *pattern;
1631             if (p == 0) {
1632                 return 1;
1633             }
1634             if (nocase) {
1635                 p = Tcl_UniCharToLower(p);
1636             }
1637             while (1) {
1638                 /*
1639                  * Optimization for matching - cruise through the string
1640                  * quickly if the next char in the pattern isn't a special
1641                  * character
1642                  */
1643                 if ((p != '[') && (p != '?') && (p != '\\')) {
1644                     if (nocase) {
1645                         while (*string && (p != *string)
1646                                 && (p != Tcl_UniCharToLower(*string))) {
1647                             string++;
1648                         }
1649                     } else {
1650                         while (*string && (p != *string)) { string++; }
1651                     }
1652                 }
1653                 if (Tcl_UniCharCaseMatch(string, pattern, nocase)) {
1654                     return 1;
1655                 }
1656                 if (*string == 0) {
1657                     return 0;
1658                 }
1659                 string++;
1660             }
1661         }
1662
1663         /*
1664          * Check for a "?" as the next pattern character.  It matches
1665          * any single character.
1666          */
1667
1668         if (p == '?') {
1669             pattern++;
1670             string++;
1671             continue;
1672         }
1673
1674         /*
1675          * Check for a "[" as the next pattern character.  It is followed
1676          * by a list of characters that are acceptable, or by a range
1677          * (two characters separated by "-").
1678          */
1679
1680         if (p == '[') {
1681             Tcl_UniChar startChar, endChar;
1682
1683             pattern++;
1684             ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
1685             string++;
1686             while (1) {
1687                 if ((*pattern == ']') || (*pattern == 0)) {
1688                     return 0;
1689                 }
1690                 startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
1691                 pattern++;
1692                 if (*pattern == '-') {
1693                     pattern++;
1694                     if (*pattern == 0) {
1695                         return 0;
1696                     }
1697                     endChar = (nocase ? Tcl_UniCharToLower(*pattern)
1698                             : *pattern);
1699                     pattern++;
1700                     if (((startChar <= ch1) && (ch1 <= endChar))
1701                             || ((endChar <= ch1) && (ch1 <= startChar))) {
1702                         /*
1703                          * Matches ranges of form [a-z] or [z-a].
1704                          */
1705                         break;
1706                     }
1707                 } else if (startChar == ch1) {
1708                     break;
1709                 }
1710             }
1711             while (*pattern != ']') {
1712                 if (*pattern == 0) {
1713                     pattern--;
1714                     break;
1715                 }
1716                 pattern++;
1717             }
1718             pattern++;
1719             continue;
1720         }
1721
1722         /*
1723          * If the next pattern character is '\', just strip off the '\'
1724          * so we do exact matching on the character that follows.
1725          */
1726
1727         if (p == '\\') {
1728             if (*(++pattern) == '\0') {
1729                 return 0;
1730             }
1731         }
1732
1733         /*
1734          * There's no special character.  Just make sure that the next
1735          * bytes of each string match.
1736          */
1737
1738         if (nocase) {
1739             if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
1740                 return 0;
1741             }
1742         } else if (*string != *pattern) {
1743             return 0;
1744         }
1745         string++;
1746         pattern++;
1747     }
1748 }