2 '\" Copyright (c) 1997 Sun Microsystems, Inc.
4 '\" See the file "license.terms" for information on usage and redistribution
5 '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
7 .TH Utf 3 "8.1" Tcl "Tcl Library Procedures"
8 .\" The -*- nroff -*- definitions below are for supplemental macros used
9 .\" in Tcl/Tk manual entries.
11 .\" .AP type name in/out ?indent?
12 .\" Start paragraph describing an argument to a library procedure.
13 .\" type is type of argument (int, etc.), in/out is either "in", "out",
14 .\" or "in/out" to describe whether procedure reads or modifies arg,
15 .\" and indent is equivalent to second arg of .IP (shouldn't ever be
16 .\" needed; use .AS below instead)
19 .\" Give maximum sizes of arguments for setting tab stops. Type and
20 .\" name are examples of largest possible arguments that will be passed
21 .\" to .AP later. If args are omitted, default tab stops are used.
24 .\" Start box enclosure. From here until next .BE, everything will be
25 .\" enclosed in one large box.
28 .\" End of box enclosure.
31 .\" Begin code excerpt.
36 .\" .VS ?version? ?br?
37 .\" Begin vertical sidebar, for use in marking newly-changed parts
38 .\" of man pages. The first argument is ignored and used for recording
39 .\" the version when the .VS was added, so that the sidebars can be
40 .\" found and removed when they reach a certain age. If another argument
41 .\" is present, then a line break is forced before starting the sidebar.
44 .\" End of vertical sidebar.
47 .\" Begin an indented unfilled display.
50 .\" End of indented unfilled display.
53 .\" Start of list of standard options for a Tk widget. The manpage
54 .\" argument defines where to look up the standard options; if
55 .\" omitted, defaults to "options". The options follow on successive
56 .\" lines, in three columns separated by tabs.
59 .\" End of list of standard options for a Tk widget.
61 .\" .OP cmdName dbName dbClass
62 .\" Start of description of a specific option. cmdName gives the
63 .\" option's name as specified in the class command, dbName gives
64 .\" the option's name in the option database, and dbClass gives
65 .\" the option's class in the option database.
68 .\" Print arg1 underlined, then print arg2 normally.
71 .\" Print arg1 in quotes, then arg2 normally (for trailing punctuation).
74 .\" Print an open parenthesis, arg1 in quotes, then arg2 normally
75 .\" (for trailing punctuation) and then a closing parenthesis.
77 .\" # Set up traps and other miscellaneous stuff for Tcl/Tk man pages.
81 .\" # Start an argument description
85 . ie !"\\$2"" .TP \\n()Cu
90 \&\\$1 \\fI\\$2\\fP (\\$3)
103 .\" # define tabbing values for .AP
106 .if !"\\$1"" .nr )A \\w'\\$1'u+3n
109 .if !"\\$2"" .nr )B \\w'\\$2'u+\\n()Au+3n
110 .nr )C \\n()Bu+\\w'(in/out)'u+2n
112 .AS Tcl_Interp Tcl_CreateInterp in/out
113 .\" # BS - start boxed text
114 .\" # ^y = starting y location
122 .if n \l'\\n(.lu\(ul'
125 .\" # BE - end boxed text (draw box now)
130 .ie n \l'\\n(^lu\(ul'
132 .\" Draw four-sided box normally, but don't draw top of
133 .\" box if the box started on an earlier page.
135 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
138 \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
145 .\" # VS - start vertical sidebar
146 .\" # ^Y = starting y location
147 .\" # ^v = 1 (for troff; for nroff this doesn't matter)
151 .ie n 'mc \s12\(br\s0
154 .\" # VE - end of vertical sidebar
162 \h'|\\n(^lu+3n'\L'|\\n(^Yu-1v\(bv'\v'\\n(^tu+1v-\\n(^Yu'\h'-|\\n(^lu+3n'
169 .\" # Special macro to handle page bottom: finish off current
170 .\" # box/sidebar if in box/sidebar mode, then invoke the standard
171 .\" # page bottom macro.
178 .\" Draw three-sided box if this is the box's first page,
179 .\" draw two sides but no top otherwise.
180 .ie !\\n(^b-1 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
181 .el \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
184 .nr ^x \\n(^tu+1v-\\n(^Yu
185 \kx\h'-\\nxu'\h'|\\n(^lu+3n'\ky\L'-\\n(^xu'\v'\\n(^xu'\h'|0u'\c
198 .\" # DS - begin display
204 .\" # DE - end display
210 .\" # SO - start of list of standard options
212 'ie '\\$1'' .ds So \\fBoptions\\fR
213 'el .ds So \\fB\\$1\\fR
214 .SH "STANDARD OPTIONS"
220 .\" # SE - end of list of standard options
225 See the \\*(So manual entry for details on the standard options.
227 .\" # OP - start of full description for a single option
232 Command-Line Name: \\fB\\$1\\fR
233 Database Name: \\fB\\$2\\fR
234 Database Class: \\fB\\$3\\fR
238 .\" # CS - begin code excerpt
244 .\" # CE - end code excerpt
249 .\" # UL - underline word
253 .\" # QW - apply quotation marks to word
255 .ie '\\*(lq'"' ``\\$1''\\$2
256 .\"" fix emacs highlighting
257 .el \\*(lq\\$1\\*(rq\\$2
259 .\" # PQ - apply parens and quotation marks to word
261 .ie '\\*(lq'"' (``\\$1''\\$2)\\$3
262 .\"" fix emacs highlighting
263 .el (\\*(lq\\$1\\*(rq\\$2)\\$3
265 .\" # QR - quoted range
267 .ie '\\*(lq'"' ``\\$1''\\-``\\$2''\\$3
268 .\"" fix emacs highlighting
269 .el \\*(lq\\$1\\*(rq\\-\\*(lq\\$2\\*(rq\\$3
271 .\" # MT - "empty" string
277 Tcl_UniChar, Tcl_UniCharCaseMatch, Tcl_UniCharNcasecmp, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings
280 \fB#include <tcl.h>\fR
282 typedef ... \fBTcl_UniChar\fR;
285 \fBTcl_UniCharToUtf\fR(\fIch, buf\fR)
288 \fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR)
291 \fBTcl_UniCharToUtfDString\fR(\fIuniStr, uniLength, dsPtr\fR)
294 \fBTcl_UtfToUniCharDString\fR(\fIsrc, length, dsPtr\fR)
297 \fBTcl_UniCharLen\fR(\fIuniStr\fR)
300 \fBTcl_UniCharNcmp\fR(\fIucs, uct, numChars\fR)
303 \fBTcl_UniCharNcasecmp\fR(\fIucs, uct, numChars\fR)
306 \fBTcl_UniCharCaseMatch\fR(\fIuniStr, uniPattern, nocase\fR)
309 \fBTcl_UtfNcmp\fR(\fIcs, ct, numChars\fR)
312 \fBTcl_UtfNcasecmp\fR(\fIcs, ct, numChars\fR)
315 \fBTcl_UtfCharComplete\fR(\fIsrc, length\fR)
318 \fBTcl_NumUtfChars\fR(\fIsrc, length\fR)
321 \fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR)
324 \fBTcl_UtfFindLast\fR(\fIsrc, ch\fR)
327 \fBTcl_UtfNext\fR(\fIsrc\fR)
330 \fBTcl_UtfPrev\fR(\fIsrc, start\fR)
333 \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR)
336 \fBTcl_UtfAtIndex\fR(\fIsrc, index\fR)
339 \fBTcl_UtfBackslash\fR(\fIsrc, readPtr, dst\fR)
341 .AS "const Tcl_UniChar" *uniPattern in/out
343 Buffer in which the UTF-8 representation of the Tcl_UniChar is stored. At most
344 \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
346 The Tcl_UniChar to be converted or examined.
347 .AP Tcl_UniChar *chPtr out
348 Filled with the Tcl_UniChar represented by the head of the UTF-8 string.
349 .AP "const char" *src in
350 Pointer to a UTF-8 string.
351 .AP "const char" *cs in
352 Pointer to a UTF-8 string.
353 .AP "const char" *ct in
354 Pointer to a UTF-8 string.
355 .AP "const Tcl_UniChar" *uniStr in
356 A null-terminated Unicode string.
357 .AP "const Tcl_UniChar" *ucs in
358 A null-terminated Unicode string.
359 .AP "const Tcl_UniChar" *uct in
360 A null-terminated Unicode string.
361 .AP "const Tcl_UniChar" *uniPattern in
362 A null-terminated Unicode string.
364 The length of the UTF-8 string in bytes (not UTF-8 characters). If
365 negative, all bytes up to the first null byte are used.
367 The length of the Unicode string in characters. Must be greater than or equal to 0.
369 .AP "Tcl_DString" *dsPtr in/out
370 A pointer to a previously initialized \fBTcl_DString\fR.
371 .AP "unsigned long" numChars in
372 The number of characters to compare.
373 .AP "const char" *start in
374 Pointer to the beginning of a UTF-8 string.
376 The index of a character (not byte) in the UTF-8 string.
378 If non-NULL, filled with the number of bytes in the backslash sequence,
379 including the backslash character.
381 Buffer in which the bytes represented by the backslash sequence are stored.
382 At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
384 Specifies whether the match should be done case-sensitive (0) or
385 case-insensitive (1).
390 These routines convert between UTF-8 strings and Tcl_UniChars. A
391 Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size
392 quantity. A UTF-8 character is a Unicode character represented as
393 a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes. A multibyte UTF-8
394 sequence consists of a lead byte followed by some number of trail bytes.
396 \fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to
397 represent one Unicode character in the UTF-8 representation.
399 \fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string
400 starting at \fIbuf\fR. The return value is the number of bytes stored in \fIbuf\fR.
403 \fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
404 and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the
405 number of bytes read from \fIsrc\fR. The caller must ensure that the
406 source buffer is long enough that this routine does not run off the
407 end and dereference non-existent or random memory; if the source buffer
408 is known to be null-terminated, this will not happen. If the input is
409 not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
410 byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and 0x00ff and return 1.
413 \fBTcl_UniCharToUtfDString\fR converts the given Unicode string
414 to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR.
415 You must specify \fIuniLength\fR, the length of the given Unicode string.
416 The return value is a pointer to the UTF-8 representation of the
417 Unicode string. Storage for the return value is appended to the
418 end of the \fBTcl_DString\fR.
420 \fBTcl_UtfToUniCharDString\fR converts the given UTF-8 string to Unicode,
421 storing the result in the previously initialized \fBTcl_DString\fR.
422 In the argument \fIlength\fR, you may either specify the length of
423 the given UTF-8 string in bytes or a negative value,
425 in which case \fBTcl_UtfToUniCharDString\fR uses \fBstrlen\fR to
426 calculate the length. The return value is a pointer to the Unicode
427 representation of the UTF-8 string. Storage for the return value
428 is appended to the end of the \fBTcl_DString\fR. The Unicode string
429 is terminated with a Unicode null character.
431 \fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Unicode
432 characters. It accepts a null-terminated Unicode string and returns
433 the number of Unicode characters (not bytes) in that string.
435 \fBTcl_UniCharNcmp\fR and \fBTcl_UniCharNcasecmp\fR correspond to
436 \fBstrncmp\fR and \fBstrncasecmp\fR, respectively, for Unicode characters.
437 They accept two null-terminated Unicode strings and the number of characters
438 to compare. Both strings are assumed to be at least \fInumChars\fR characters
439 long. \fBTcl_UniCharNcmp\fR compares the two strings character-by-character
440 according to the Unicode character ordering. It returns an integer greater
441 than, equal to, or less than 0 if the first string is greater than, equal
442 to, or less than the second string respectively. \fBTcl_UniCharNcasecmp\fR
443 is the Unicode case insensitive version.
445 \fBTcl_UniCharCaseMatch\fR is the Unicode equivalent to
446 \fBTcl_StringCaseMatch\fR. It accepts a null-terminated Unicode string,
447 a Unicode pattern, and a boolean value specifying whether the match should
448 be case-insensitive, and returns whether the string matches the pattern.
450 \fBTcl_UtfNcmp\fR corresponds to \fBstrncmp\fR for UTF-8 strings. It
451 accepts two null-terminated UTF-8 strings and the number of characters
452 to compare. (Both strings are assumed to be at least \fInumChars\fR
453 characters long.) \fBTcl_UtfNcmp\fR compares the two strings
454 character-by-character according to the Unicode character ordering.
455 It returns an integer greater than, equal to, or less than 0 if the
456 first string is greater than, equal to, or less than the second string respectively.
459 \fBTcl_UtfNcasecmp\fR corresponds to \fBstrncasecmp\fR for UTF-8
460 strings. It is similar to \fBTcl_UtfNcmp\fR except comparisons ignore
461 differences in case when comparing upper, lower or title case characters.
464 \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR
465 of \fIlength\fR bytes is long enough to be decoded by
466 \fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee
467 that the UTF-8 string is properly formed. This routine is used by
468 procedures that are operating on a byte at a time and need to know if a
469 full Tcl_UniChar has been seen.
471 \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It
472 returns the number of Tcl_UniChars that are represented by the UTF-8 string
473 \fIsrc\fR. The length of the source string is \fIlength\fR bytes. If the
474 length is negative, all bytes up to the first null byte are used.
476 \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It
477 returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR
478 in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is
479 considered part of the UTF-8 string.
481 \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It
482 returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR
483 in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is
484 considered part of the UTF-8 string.
486 Given \fIsrc\fR, a pointer to some location in a UTF-8 string,
487 \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the
488 string. The caller must not ask for the next character after the last
489 character in the string if the string is not terminated by a null character.
492 Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a
493 null byte immediately following such a string), \fBTcl_UtfPrev\fR
494 returns a pointer to the closest preceding byte that starts a UTF-8 character.
496 This function will not back up to a position before \fIstart\fR,
497 the start of the UTF-8 string. If \fIsrc\fR was already at \fIstart\fR, the
498 return value will be \fIstart\fR.
500 \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the
501 Pascal Ord() function. It returns the Tcl_UniChar represented at the
502 specified character (not byte) \fIindex\fR in the UTF-8 string
503 \fIsrc\fR. The source string must contain at least \fIindex\fR
504 characters. Behavior is undefined if a negative \fIindex\fR is given.
506 \fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not
507 byte) \fIindex\fR in the UTF-8 string \fIsrc\fR. The source string must
508 contain at least \fIindex\fR characters. This is equivalent to calling
509 \fBTcl_UtfNext\fR \fIindex\fR times. If a negative \fIindex\fR is given,
510 the returned pointer points to the first character in the source string.
512 \fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl
513 commands. It parses a backslash sequence and stores the properly formed
514 UTF-8 character represented by the backslash sequence in the output
515 buffer \fIdst\fR. At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
516 \fBTcl_UtfBackslash\fR modifies \fI*readPtr\fR to contain the number
517 of bytes in the backslash sequence, including the backslash character.
518 The return value is the number of bytes stored in the output buffer.
520 See the \fBTcl\fR manual entry for information on the valid backslash
521 sequences. All of the sequences described in the Tcl manual entry are
522 supported by \fBTcl_UtfBackslash\fR.
525 utf, unicode, backslash