util/X86LINUX64/man/mann/re_syntax.n

   1 '\"
   2 '\" Copyright (c) 1998 Sun Microsystems, Inc.
   3 '\" Copyright (c) 1999 Scriptics Corporation
   4 '\"
   5 '\" See the file "license.terms" for information on usage and redistribution
   6 '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
   7 '\"
   8 .\" The -*- nroff -*- definitions below are for supplemental macros used
   9 .\" in Tcl/Tk manual entries.
  10 .\"
  11 .\" .AP type name in/out ?indent?
  12 .\"     Start paragraph describing an argument to a library procedure.
  13 .\"     type is type of argument (int, etc.), in/out is either "in", "out",
  14 .\"     or "in/out" to describe whether procedure reads or modifies arg,
  15 .\"     and indent is equivalent to second arg of .IP (shouldn't ever be
  16 .\"     needed;  use .AS below instead)
  17 .\"
  18 .\" .AS ?type? ?name?
  19 .\"     Give maximum sizes of arguments for setting tab stops.  Type and
  20 .\"     name are examples of largest possible arguments that will be passed
  21 .\"     to .AP later.  If args are omitted, default tab stops are used.
  22 .\"
  23 .\" .BS
  24 .\"     Start box enclosure.  From here until next .BE, everything will be
  25 .\"     enclosed in one large box.
  26 .\"
  27 .\" .BE
  28 .\"     End of box enclosure.
  29 .\"
  30 .\" .CS
  31 .\"     Begin code excerpt.
  32 .\"
  33 .\" .CE
  34 .\"     End code excerpt.
  35 .\"
  36 .\" .VS ?version? ?br?
  37 .\"     Begin vertical sidebar, for use in marking newly-changed parts
  38 .\"     of man pages.  The first argument is ignored and used for recording
  39 .\"     the version when the .VS was added, so that the sidebars can be
  40 .\"     found and removed when they reach a certain age.  If another argument
  41 .\"     is present, then a line break is forced before starting the sidebar.
  42 .\"
  43 .\" .VE
  44 .\"     End of vertical sidebar.
  45 .\"
  46 .\" .DS
  47 .\"     Begin an indented unfilled display.
  48 .\"
  49 .\" .DE
  50 .\"     End of indented unfilled display.
  51 .\"
  52 .\" .SO ?manpage?
  53 .\"     Start of list of standard options for a Tk widget. The manpage
  54 .\"     argument defines where to look up the standard options; if
  55 .\"     omitted, defaults to "options". The options follow on successive
  56 .\"     lines, in three columns separated by tabs.
  57 .\"
  58 .\" .SE
  59 .\"     End of list of standard options for a Tk widget.
  60 .\"
  61 .\" .OP cmdName dbName dbClass
  62 .\"     Start of description of a specific option.  cmdName gives the
  63 .\"     option's name as specified in the class command, dbName gives
  64 .\"     the option's name in the option database, and dbClass gives
  65 .\"     the option's class in the option database.
  66 .\"
  67 .\" .UL arg1 arg2
  68 .\"     Print arg1 underlined, then print arg2 normally.
  69 .\"
  70 .\" .QW arg1 ?arg2?
  71 .\"     Print arg1 in quotes, then arg2 normally (for trailing punctuation).
  72 .\"
  73 .\" .PQ arg1 ?arg2? ?arg3?
  74 .\"     Print an open parenthesis, arg1 in quotes, arg2 normally, and a
  75 .\"     closing parenthesis, then arg3 normally (for trailing punctuation).
  76 .\"
  77 .\"     # Setup for Tcl/Tk man pages: in troff, plant the ^B page-bottom trap; save the line length in ^l; adjust both margins.
  78 .if t .wh -1.3i ^B
  79 .nr ^l \n(.l
  80 .ad b
  81 .\"     # AP - start an argument description (args: type name in/out ?indent?)
  82 .de AP
  83 .ie !"\\$4"" .TP \\$4
  84 .el \{\
  85 .   ie !"\\$2"" .TP \\n()Cu
  86 .   el          .TP 15
  87 .\}
  88 .ta \\n()Au \\n()Bu
  89 .ie !"\\$3"" \{\
  90 \&\\$1 \\fI\\$2\\fP (\\$3)
  91 .\".b
  92 .\}
  93 .el \{\
  94 .br
  95 .ie !"\\$2"" \{\
  96 \&\\$1  \\fI\\$2\\fP
  97 .\}
  98 .el \{\
  99 \&\\fI\\$1\\fP
 100 .\}
 101 .\}
 102 ..
 103 .\"     # AS - define tabbing values for .AP: registers )A and )B are its tab stops, )C its .TP indent
 104 .de AS
 105 .nr )A 10n
 106 .if !"\\$1"" .nr )A \\w'\\$1'u+3n
 107 .nr )B \\n()Au+15n
 108 .\"
 109 .if !"\\$2"" .nr )B \\w'\\$2'u+\\n()Au+3n
 110 .nr )C \\n()Bu+\\w'(in/out)'u+2n
 111 ..
 112 .AS Tcl_Interp Tcl_CreateInterp in/out
 113 .\"     # BS - start boxed text (nroff: draw the top rule immediately)
 114 .\"     # ^y = starting y location of the box
 115 .\"     # ^b = 1 while a box is open (nonzero means inside a box)
 116 .de BS
 117 .br
 118 .mk ^y
 119 .nr ^b 1u
 120 .if n .nf
 121 .if n .ti 0
 122 .if n \l'\\n(.lu\(ul'
 123 .if n .fi
 124 ..
 125 .\"     # BE - end boxed text (nroff: draw the bottom rule; troff: draw the box now)
 126 .de BE
 127 .nf
 128 .ti 0
 129 .mk ^t
 130 .ie n \l'\\n(^lu\(ul'
 131 .el \{\
 132 .\"     Draw four-sided box normally (^b = 1), but don't draw top of
 133 .\"     box if the box started on an earlier page (^b = 2).
 134 .ie !\\n(^b-1 \{\
 135 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
 136 .\}
 137 .el \{\
 138 \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
 139 .\}
 140 .\}
 141 .fi
 142 .br
 143 .nr ^b 0
 144 ..
 145 .\"     # VS - start vertical sidebar (a second argument forces a line break first)
 146 .\"     # ^Y = starting y location
 147 .\"     # ^v = 1 (for troff;  nroff sets a margin character instead)
 148 .de VS
 149 .if !"\\$2"" .br
 150 .mk ^Y
 151 .ie n 'mc \s12\(br\s0
 152 .el .nr ^v 1u
 153 ..
 154 .\"     # VE - end of vertical sidebar (nroff: margin character off; troff: draw the bar from ^Y down to here)
 155 .de VE
 156 .ie n 'mc
 157 .el \{\
 158 .ev 2
 159 .nf
 160 .ti 0
 161 .mk ^t
 162 \h'|\\n(^lu+3n'\L'|\\n(^Yu-1v\(bv'\v'\\n(^tu+1v-\\n(^Yu'\h'-|\\n(^lu+3n'
 163 .sp -1
 164 .fi
 165 .ev
 166 .\}
 167 .nr ^v 0
 168 ..
 169 .\"     # ^B - page-bottom trap macro:  finish off the current
 170 .\"     # box/sidebar if in box/sidebar mode, break the page (.bp),
 171 .\"     # then re-mark the box/sidebar start (^b = 2: box continued).
 172 .de ^B
 173 .ev 2
 174 'ti 0
 175 'nf
 176 .mk ^t
 177 .if \\n(^b \{\
 178 .\"     Draw three-sided box if this is the box's first page,
 179 .\"     draw two sides but no top otherwise.
 180 .ie !\\n(^b-1 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
 181 .el \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
 182 .\}
 183 .if \\n(^v \{\
 184 .nr ^x \\n(^tu+1v-\\n(^Yu
 185 \kx\h'-\\nxu'\h'|\\n(^lu+3n'\ky\L'-\\n(^xu'\v'\\n(^xu'\h'|0u'\c
 186 .\}
 187 .bp
 188 'fi
 189 .ev
 190 .if \\n(^b \{\
 191 .mk ^y
 192 .nr ^b 2
 193 .\}
 194 .if \\n(^v \{\
 195 .mk ^Y
 196 .\}
 197 ..
 198 .\"     # DS - begin display: indent (.RS), switch to no-fill mode, add vertical space
 199 .de DS
 200 .RS
 201 .nf
 202 .sp
 203 ..
 204 .\"     # DE - end display: restore fill mode, end the indent (.RE), add vertical space
 205 .de DE
 206 .fi
 207 .RE
 208 .sp
 209 ..
 210 .\"     # SO - start of list of standard options; string So records the manpage name (default: options) for .SE
 211 .de SO
 212 'ie '\\$1'' .ds So \\fBoptions\\fR
 213 'el .ds So \\fB\\$1\\fR
 214 .SH "STANDARD OPTIONS"
 215 .LP
 216 .nf
 217 .ta 5.5c 11c
 218 .ft B
 219 ..
 220 .\"     # SE - end of list of standard options; prints a pointer to the manual entry named by string So
 221 .de SE
 222 .fi
 223 .ft R
 224 .LP
 225 See the \\*(So manual entry for details on the standard options.
 226 ..
 227 .\"     # OP - start of full description for a single option (args: cmdName dbName dbClass)
 228 .de OP
 229 .LP
 230 .nf
 231 .ta 4c
 232 Command-Line Name:      \\fB\\$1\\fR
 233 Database Name:  \\fB\\$2\\fR
 234 Database Class: \\fB\\$3\\fR
 235 .fi
 236 .IP
 237 ..
 238 .\"     # CS - begin code excerpt: indent (.RS), no-fill mode, tab stops every quarter inch up to 1i
 239 .de CS
 240 .RS
 241 .nf
 242 .ta .25i .5i .75i 1i
 243 ..
 244 .\"     # CE - end code excerpt: restore fill mode and end the indent (.RE)
 245 .de CE
 246 .fi
 247 .RE
 248 ..
 249 .\"     # UL - underline word: print arg1, draw an underrule back to the left margin, then print arg2
 250 .de UL
 251 \\$1\l'|0\(ul'\\$2
 252 ..
 253 .\"     # QW - apply quotation marks to word (arg1), then print arg2; falls back to ASCII quote pairs when string lq is a plain double quote
 254 .de QW
 255 .ie '\\*(lq'"' ``\\$1''\\$2
 256 .\"" fix emacs highlighting
 257 .el \\*(lq\\$1\\*(rq\\$2
 258 ..
 259 .\"     # PQ - apply parens and quotation marks to word: (quoted arg1, then arg2) followed by arg3
 260 .de PQ
 261 .ie '\\*(lq'"' (``\\$1''\\$2)\\$3
 262 .\"" fix emacs highlighting
 263 .el (\\*(lq\\$1\\*(rq\\$2)\\$3
 264 ..
 265 .\"     # QR - quoted range: quoted arg1, a minus sign, quoted arg2, then arg3
 266 .de QR
 267 .ie '\\*(lq'"' ``\\$1''\\-``\\$2''\\$3
 268 .\"" fix emacs highlighting
 269 .el \\*(lq\\$1\\*(rq\\-\\*(lq\\$2\\*(rq\\$3
 270 ..
 271 .\"     # MT - "empty" string: calls .QW with an empty first argument
 272 .de MT
 273 .QW ""
 274 ..
 275 .ie '\w'o''\w'\C'^o''' .ds qo \C'^o'
 276 .el .ds qo u
 277 .TH re_syntax n "8.1" Tcl "Tcl Built-In Commands"
 278 .BS
 279 .SH NAME
 280 re_syntax \- Syntax of Tcl regular expressions
 281 .BE
 282 .SH DESCRIPTION
 283 .PP
 284 A \fIregular expression\fR describes strings of characters.
 285 It's a pattern that matches certain strings and does not match others.
 286 .SH "DIFFERENT FLAVORS OF REs"
 287 Regular expressions
 288 .PQ RE s ,
 289 as defined by POSIX, come in two flavors: \fIextended\fR REs
 290 .PQ ERE s
 291 and \fIbasic\fR REs
 292 .PQ BRE s .
 293 EREs are roughly those of the traditional \fIegrep\fR, while BREs are
 294 roughly those of the traditional \fIed\fR. This implementation adds
 295 a third flavor, \fIadvanced\fR REs
 296 .PQ ARE s ,
 297 basically EREs with some significant extensions.
 298 .PP
 299 This manual page primarily describes AREs. BREs mostly exist for
 300 backward compatibility in some old programs; they will be discussed at
 301 the end. POSIX EREs are almost an exact subset of AREs. Features of
 302 AREs that are not present in EREs will be indicated.
 303 .SH "REGULAR EXPRESSION SYNTAX"
 304 .PP
 305 Tcl regular expressions are implemented using the package written by
 306 Henry Spencer, based on the 1003.2 spec and some (not quite all) of
 307 the Perl5 extensions (thanks, Henry!). Much of the description of
 308 regular expressions below is copied verbatim from his manual entry.
 309 .PP
 310 An ARE is one or more \fIbranches\fR,
 311 separated by
 312 .QW \fB|\fR ,
 313 matching anything that matches any of the branches.
 314 .PP
 315 A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR,
 316 concatenated.
 317 It matches a match for the first, followed by a match for the second, etc;
 318 an empty branch matches the empty string.
 319 .SS QUANTIFIERS
 320 A quantified atom is an \fIatom\fR possibly followed
 321 by a single \fIquantifier\fR.
 322 Without a quantifier, it matches a single match for the atom.
 323 The quantifiers,
 324 and what a so-quantified atom matches, are:
 325 .RS 2
 326 .TP 6
 327 \fB*\fR
 328 .
 329 a sequence of 0 or more matches of the atom
 330 .TP
 331 \fB+\fR
 332 .
 333 a sequence of 1 or more matches of the atom
 334 .TP
 335 \fB?\fR
 336 .
 337 a sequence of 0 or 1 matches of the atom
 338 .TP
 339 \fB{\fIm\fB}\fR
 340 .
 341 a sequence of exactly \fIm\fR matches of the atom
 342 .TP
 343 \fB{\fIm\fB,}\fR
 344 .
 345 a sequence of \fIm\fR or more matches of the atom
 346 .TP
 347 \fB{\fIm\fB,\fIn\fB}\fR
 348 .
 349 a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom;
 350 \fIm\fR may not exceed \fIn\fR
 351 .TP
 352 \fB*?  +?  ??  {\fIm\fB}?  {\fIm\fB,}?  {\fIm\fB,\fIn\fB}?\fR
 353 .
 354 \fInon-greedy\fR quantifiers, which match the same possibilities,
 355 but prefer the smallest number rather than the largest number
 356 of matches (see \fBMATCHING\fR)
 357 .RE
 358 .PP
 359 The forms using \fB{\fR and \fB}\fR are known as \fIbound\fRs. The
 360 numbers \fIm\fR and \fIn\fR are unsigned decimal integers with
 361 permissible values from 0 to 255 inclusive.
 362 .SS ATOMS
 363 An atom is one of:
 364 .RS 2
 365 .IP \fB(\fIre\fB)\fR 6
 366 matches a match for \fIre\fR (\fIre\fR is any regular expression) with
 367 the match noted for possible reporting
 368 .IP \fB(?:\fIre\fB)\fR
 369 as previous, but does no reporting (a
 370 .QW non-capturing
 371 set of parentheses)
 372 .IP \fB()\fR
 373 matches an empty string, noted for possible reporting
 374 .IP \fB(?:)\fR
 375 matches an empty string, without reporting
 376 .IP \fB[\fIchars\fB]\fR
 377 a \fIbracket expression\fR, matching any one of the \fIchars\fR (see
 378 \fBBRACKET EXPRESSIONS\fR for more detail)
 379 .IP \fB.\fR
 380 matches any single character
 381 .IP \fB\e\fIk\fR
 382 matches the non-alphanumeric character \fIk\fR
 383 taken as an ordinary character, e.g. \fB\e\e\fR matches a backslash
 384 character
 385 .IP \fB\e\fIc\fR
 386 where \fIc\fR is alphanumeric (possibly followed by other characters),
 387 an \fIescape\fR (AREs only), see \fBESCAPES\fR below
 388 .IP \fB{\fR
 389 when followed by a character other than a digit, matches the
 390 left-brace character
 391 .QW \fB{\fR ;
 392 when followed by a digit, it is the beginning of a \fIbound\fR (see above)
 393 .IP \fIx\fR
 394 where \fIx\fR is a single character with no other significance,
 395 matches that character.
 396 .RE
 397 .SS CONSTRAINTS
 398 A \fIconstraint\fR matches an empty string when specific conditions
 399 are met. A constraint may not be followed by a quantifier. The
 400 simple constraints are as follows; some more constraints are described
 401 later, under \fBESCAPES\fR.
 402 .RS 2
 403 .TP 8
 404 \fB^\fR
 405 .
 406 matches at the beginning of a line
 407 .TP
 408 \fB$\fR
 409 .
 410 matches at the end of a line
 411 .TP
 412 \fB(?=\fIre\fB)\fR
 413 .
 414 \fIpositive lookahead\fR (AREs only), matches at any point where a
 415 substring matching \fIre\fR begins
 416 .TP
 417 \fB(?!\fIre\fB)\fR
 418 .
 419 \fInegative lookahead\fR (AREs only), matches at any point where no
 420 substring matching \fIre\fR begins
 421 .RE
 422 .PP
 423 The lookahead constraints may not contain back references (see later),
 424 and all parentheses within them are considered non-capturing.
 425 .PP
 426 An RE may not end with
 427 .QW \fB\e\fR .
 428 .SH "BRACKET EXPRESSIONS"
 429 A \fIbracket expression\fR is a list of characters enclosed in
 430 .QW \fB[\|]\fR .
 431 It normally matches any single character from the list
 432 (but see below). If the list begins with
 433 .QW \fB^\fR ,
 434 it matches any single character (but see below) \fInot\fR from the
 435 rest of the list.
 436 .PP
 437 If two characters in the list are separated by
 438 .QW \fB\-\fR ,
 439 this is shorthand for the full \fIrange\fR of characters between those two
 440 (inclusive) in the collating sequence, e.g.
 441 .QW \fB[0\-9]\fR
 442 in Unicode matches any conventional decimal digit. Two ranges may not share an
 443 endpoint, so e.g.
 444 .QW \fBa\-c\-e\fR
 445 is illegal. Ranges in Tcl always use the
 446 Unicode collating sequence, but other programs may use other collating
 447 sequences and this can be a source of incompatibility between programs.
 448 .PP
 449 To include a literal \fB]\fR or \fB\-\fR in the list, the simplest
 450 method is to enclose it in \fB[.\fR and \fB.]\fR to make it a
 451 collating element (see below). Alternatively, make it the first
 452 character (following a possible
 453 .QW \fB^\fR ),
 454 or (AREs only) precede it with
 455 .QW \fB\e\fR .
 456 Alternatively, for
 457 .QW \fB\-\fR ,
 458 make it the last character, or the second endpoint of a range. To use
 459 a literal \fB\-\fR as the first endpoint of a range, make it a
 460 collating element or (AREs only) precede it with
 461 .QW \fB\e\fR .
 462 With the exception of
 463 these, some combinations using \fB[\fR (see next paragraphs), and
 464 escapes, all other special characters lose their special significance
 465 within a bracket expression.
 466 .SS "CHARACTER CLASSES"
 467 Within a bracket expression, the name of a \fIcharacter class\fR
 468 enclosed in \fB[:\fR and \fB:]\fR stands for the list of all
 469 characters (not all collating elements!) belonging to that class.
 470 Standard character classes are:
 471 .IP \fBalpha\fR 8
 472 A letter.
 473 .IP \fBupper\fR 8
 474 An upper-case letter.
 475 .IP \fBlower\fR 8
 476 A lower-case letter.
 477 .IP \fBdigit\fR 8
 478 A decimal digit.
 479 .IP \fBxdigit\fR 8
 480 A hexadecimal digit.
 481 .IP \fBalnum\fR 8
 482 An alphanumeric (letter or digit).
 483 .IP \fBprint\fR 8
 484 A "printable" (same as graph, except also including space).
 485 .IP \fBblank\fR 8
 486 A space or tab character.
 487 .IP \fBspace\fR 8
 488 A character producing white space in displayed text.
 489 .IP \fBpunct\fR 8
 490 A punctuation character.
 491 .IP \fBgraph\fR 8
 492 A character with a visible representation (includes both \fBalnum\fR
 493 and \fBpunct\fR).
 494 .IP \fBcntrl\fR 8
 495 A control character.
 496 .PP
 497 A locale may provide others. A character class may not be used as an endpoint
 498 of a range.
 499 .RS
 500 .PP
 501 (\fINote:\fR the current Tcl implementation has only one locale, the Unicode
 502 locale, which supports exactly the above classes.)
 503 .RE
 504 .SS "BRACKETED CONSTRAINTS"
 505 There are two special cases of bracket expressions: the bracket
 506 expressions
 507 .QW \fB[[:<:]]\fR
 508 and
 509 .QW \fB[[:>:]]\fR
 510 are constraints, matching empty strings at the beginning and end of a word
 511 respectively.
 512 .\" note, discussion of escapes below references this definition of word
 513 A word is defined as a sequence of word characters that is neither preceded
 514 nor followed by word characters. A word character is an \fIalnum\fR character
 515 or an underscore
 516 .PQ \fB_\fR "" .
 517 These special bracket expressions are deprecated; users of AREs should use
 518 constraint escapes instead (see below).
 519 .SS "COLLATING ELEMENTS"
 520 Within a bracket expression, a collating element (a character, a
 521 multi-character sequence that collates as if it were a single
 522 character, or a collating-sequence name for either) enclosed in
 523 \fB[.\fR and \fB.]\fR stands for the sequence of characters of that
 524 collating element. The sequence is a single element of the bracket
 525 expression's list. A bracket expression in a locale that has
 526 multi-character collating elements can thus match more than one
 527 character. So (insidiously), a bracket expression that starts with
 528 \fB^\fR can match multi-character collating elements even if none of
 529 them appear in the bracket expression!
 530 .RS
 531 .PP
 532 (\fINote:\fR Tcl has no multi-character collating elements. This information
 533 is only for illustration.)
 534 .RE
 535 .PP
 536 For example, assume the collating sequence includes a \fBch\fR multi-character
 537 collating element. Then the RE
 538 .QW \fB[[.ch.]]*c\fR
 539 (zero or more
 540 .QW \fBch\fRs
 541 followed by
 542 .QW \fBc\fR )
 543 matches the first five characters of
 544 .QW \fBchchcc\fR .
 545 Also, the RE
 546 .QW \fB[^c]b\fR
 547 matches all of
 548 .QW \fBchb\fR
 549 (because
 550 .QW \fB[^c]\fR
 551 matches the multi-character
 552 .QW \fBch\fR ).
 553 .SS "EQUIVALENCE CLASSES"
 554 Within a bracket expression, a collating element enclosed in \fB[=\fR
 555 and \fB=]\fR is an equivalence class, standing for the sequences of
 556 characters of all collating elements equivalent to that one, including
 557 itself. (If there are no other equivalent collating elements, the
 558 treatment is as if the enclosing delimiters were
 559 .QW \fB[.\fR \&
 560 and
 561 .QW \fB.]\fR .)
 562 For example, if \fBo\fR and \fB\*(qo\fR are the members of an
 563 equivalence class, then
 564 .QW \fB[[=o=]]\fR ,
 565 .QW \fB[[=\*(qo=]]\fR ,
 566 and
 567 .QW \fB[o\*(qo]\fR \&
 568 are all synonymous. An equivalence class may not be an endpoint of a range.
 569 .RS
 570 .PP
 571 (\fINote:\fR Tcl implements only the Unicode locale. It does not define any
 572 equivalence classes. The examples above are just illustrations.)
 573 .RE
 574 .SH ESCAPES
 575 Escapes (AREs only), which begin with a \fB\e\fR followed by an
 576 alphanumeric character, come in several varieties: character entry,
 577 class shorthands, constraint escapes, and back references. A \fB\e\fR
 578 followed by an alphanumeric character but not constituting a valid
 579 escape is illegal in AREs. In EREs, there are no escapes: outside a
 580 bracket expression, a \fB\e\fR followed by an alphanumeric character
 581 merely stands for that character as an ordinary character, and inside
 582 a bracket expression, \fB\e\fR is an ordinary character. (The latter
 583 is the one actual incompatibility between EREs and AREs.)
 584 .SS "CHARACTER-ENTRY ESCAPES"
 585 Character-entry escapes (AREs only) exist to make it easier to specify
 586 non-printing and otherwise inconvenient characters in REs:
 587 .RS 2
 588 .TP 5
 589 \fB\ea\fR
 590 .
 591 alert (bell) character, as in C
 592 .TP
 593 \fB\eb\fR
 594 .
 595 backspace, as in C
 596 .TP
 597 \fB\eB\fR
 598 .
 599 synonym for \fB\e\fR to help reduce backslash doubling in some
 600 applications where there are multiple levels of backslash processing
 601 .TP
 602 \fB\ec\fIX\fR
 603 .
 604 (where \fIX\fR is any character) the character whose low-order 5 bits
 605 are the same as those of \fIX\fR, and whose other bits are all zero
 606 .TP
 607 \fB\ee\fR
 608 .
 609 the character whose collating-sequence name is
 610 .QW \fBESC\fR ,
 611 or failing that, the character with octal value 033
 612 .TP
 613 \fB\ef\fR
 614 .
 615 formfeed, as in C
 616 .TP
 617 \fB\en\fR
 618 .
 619 newline, as in C
 620 .TP
 621 \fB\er\fR
 622 .
 623 carriage return, as in C
 624 .TP
 625 \fB\et\fR
 626 .
 627 horizontal tab, as in C
 628 .TP
 629 \fB\eu\fIwxyz\fR
 630 .
 631 (where \fIwxyz\fR is one to four hexadecimal digits) the Unicode
 632 character \fBU+\fIwxyz\fR in the local byte ordering
 633 .TP
 634 \fB\eU\fIstuvwxyz\fR
 635 .
 636 (where \fIstuvwxyz\fR is one to eight hexadecimal digits) reserved
 637 for a Unicode extension up to 21 bits. The digits are parsed until the
 638 first non-hexadecimal character is encountered, the maximum of eight
 639 hexadecimal digits is reached, or the value would overflow the maximum
 640 value of \fBU+\fI10ffff\fR.
 641 .TP
 642 \fB\ev\fR
 643 .
 644 vertical tab, as in C
 645 .TP
 646 \fB\ex\fIhh\fR
 647 .
 648 (where \fIhh\fR is one or two hexadecimal digits) the character
 649 whose hexadecimal value is \fB0x\fIhh\fR.
 650 .TP
 651 \fB\e0\fR
 652 .
 653 the character whose value is \fB0\fR
 654 .TP
 655 \fB\e\fIxyz\fR
 656 .
 657 (where \fIxyz\fR is exactly three octal digits, and is not a \fIback
 658 reference\fR (see below)) the character whose octal value is
 659 \fB0\fIxyz\fR. The first digit must be in the range 0-3, otherwise
 660 the two-digit form is assumed.
 661 .TP
 662 \fB\e\fIxy\fR
 663 .
 664 (where \fIxy\fR is exactly two octal digits, and is not a \fIback
 665 reference\fR (see below)) the character whose octal value is
 666 \fB0\fIxy\fR
 667 .RE
 668 .PP
 669 Hexadecimal digits are
 670 .QR \fB0\fR \fB9\fR ,
 671 .QR \fBa\fR \fBf\fR ,
 672 and
 673 .QR \fBA\fR \fBF\fR .
 674 Octal digits are
 675 .QR \fB0\fR \fB7\fR .
 676 .PP
 677 The character-entry escapes are always taken as ordinary characters.
 678 For example, \fB\e135\fR is \fB]\fR in Unicode, but \fB\e135\fR does
 679 not terminate a bracket expression. Beware, however, that some
 680 applications (e.g., C compilers and the Tcl interpreter if the regular
 681 expression is not quoted with braces) interpret such sequences
 682 themselves before the regular-expression package gets to see them,
 683 which may require doubling (quadrupling, etc.) the
 684 .QW \fB\e\fR .
 685 .SS "CLASS-SHORTHAND ESCAPES"
 686 Class-shorthand escapes (AREs only) provide shorthands for certain
 687 commonly-used character classes:
 688 .RS 2
 689 .TP 10
 690 \fB\ed\fR
 691 .
 692 \fB[[:digit:]]\fR
 693 .TP
 694 \fB\es\fR
 695 .
 696 \fB[[:space:]]\fR
 697 .TP
 698 \fB\ew\fR
 699 .
 700 \fB[[:alnum:]_]\fR (note underscore)
 701 .TP
 702 \fB\eD\fR
 703 .
 704 \fB[^[:digit:]]\fR
 705 .TP
 706 \fB\eS\fR
 707 .
 708 \fB[^[:space:]]\fR
 709 .TP
 710 \fB\eW\fR
 711 .
 712 \fB[^[:alnum:]_]\fR (note underscore)
 713 .RE
 714 .PP
 715 Within bracket expressions,
 716 .QW \fB\ed\fR ,
 717 .QW \fB\es\fR ,
 718 and
 719 .QW \fB\ew\fR \&
 720 lose their outer brackets, and
 721 .QW \fB\eD\fR ,
 722 .QW \fB\eS\fR ,
 723 and
 724 .QW \fB\eW\fR \&
 725 are illegal. (So, for example,
 726 .QW \fB[a-c\ed]\fR
 727 is equivalent to
 728 .QW \fB[a-c[:digit:]]\fR .
 729 Also,
 730 .QW \fB[a-c\eD]\fR ,
 731 which is equivalent to
 732 .QW \fB[a-c^[:digit:]]\fR ,
 733 is illegal.)
 734 .SS "CONSTRAINT ESCAPES"
 735 A constraint escape (AREs only) is a constraint, matching the empty
 736 string if specific conditions are met, written as an escape:
 737 .RS 2
 738 .TP 6
 739 \fB\eA\fR
 740 .
 741 matches only at the beginning of the string (see \fBMATCHING\fR,
 742 below, for how this differs from
 743 .QW \fB^\fR )
 744 .TP
 745 \fB\em\fR
 746 .
 747 matches only at the beginning of a word
 748 .TP
 749 \fB\eM\fR
 750 .
 751 matches only at the end of a word
 752 .TP
 753 \fB\ey\fR
 754 .
 755 matches only at the beginning or end of a word
 756 .TP
 757 \fB\eY\fR
 758 .
 759 matches only at a point that is not the beginning or end of a word
 760 .TP
 761 \fB\eZ\fR
 762 .
 763 matches only at the end of the string (see \fBMATCHING\fR, below, for
 764 how this differs from
 765 .QW \fB$\fR )
 766 .TP
 767 \fB\e\fIm\fR
 768 .
 769 (where \fIm\fR is a nonzero digit) a \fIback reference\fR, see below
 770 .TP
 771 \fB\e\fImnn\fR
 772 .
 773 (where \fIm\fR is a nonzero digit, and \fInn\fR is some more digits,
 774 and the decimal value \fImnn\fR is not greater than the number of
 775 closing capturing parentheses seen so far) a \fIback reference\fR, see
 776 below
 777 .RE
 778 .PP
 779 A word is defined as in the specification of
 780 .QW \fB[[:<:]]\fR
 781 and
 782 .QW \fB[[:>:]]\fR
 783 above. Constraint escapes are illegal within bracket expressions.
 784 .SS "BACK REFERENCES"
 785 A back reference (AREs only) matches the same string matched by the
 786 parenthesized subexpression specified by the number, so that (e.g.)
 787 .QW \fB([bc])\e1\fR
 788 matches
 789 .QW \fBbb\fR
 790 or
 791 .QW \fBcc\fR
 792 but not
 793 .QW \fBbc\fR .
 794 The subexpression must entirely precede the back reference in the RE.
 795 Subexpressions are numbered in the order of their leading parentheses.
 796 Non-capturing parentheses do not define subexpressions.
 797 .PP
 798 There is an inherent historical ambiguity between octal
 799 character-entry escapes and back references, which is resolved by
 800 heuristics, as hinted at above. A leading zero always indicates an
 801 octal escape. A single non-zero digit, not followed by another digit,
 802 is always taken as a back reference. A multi-digit sequence not
 803 starting with a zero is taken as a back reference if it comes after a
 804 suitable subexpression (i.e. the number is in the legal range for a
 805 back reference), and otherwise is taken as octal.
 806 .SH "METASYNTAX"
 807 In addition to the main syntax described above, there are some special
 808 forms and miscellaneous syntactic facilities available.
 809 .PP
 810 Normally the flavor of RE being used is specified by
 811 application-dependent means. However, this can be overridden by a
 812 \fIdirector\fR. If an RE of any flavor begins with
 813 .QW \fB***:\fR ,
 814 the rest of the RE is an ARE. If an RE of any flavor begins with
 815 .QW \fB***=\fR ,
 816 the rest of the RE is taken to be a literal string, with
 817 all characters considered ordinary characters.
 818 .PP
 819 An ARE may begin with \fIembedded options\fR: a sequence
 820 \fB(?\fIxyz\fB)\fR (where \fIxyz\fR is one or more alphabetic
 821 characters) specifies options affecting the rest of the RE. These
 822 supplement, and can override, any options specified by the
 823 application. The available option letters are:
 824 .RS 2
 825 .TP 3
 826 \fBb\fR
 827 .
 828 rest of RE is a BRE
 829 .TP 3
 830 \fBc\fR
 831 .
 832 case-sensitive matching (usual default)
 833 .TP 3
 834 \fBe\fR
 835 .
 836 rest of RE is an ERE
 837 .TP 3
 838 \fBi\fR
 839 .
 840 case-insensitive matching (see \fBMATCHING\fR, below)
 841 .TP 3
 842 \fBm\fR
 843 .
 844 historical synonym for \fBn\fR
 845 .TP 3
 846 \fBn\fR
 847 .
 848 newline-sensitive matching (see \fBMATCHING\fR, below)
 849 .TP 3
 850 \fBp\fR
 851 .
 852 partial newline-sensitive matching (see \fBMATCHING\fR, below)
 853 .TP 3
 854 \fBq\fR
 855 .
 856 rest of RE is a literal
 857 .PQ quoted
 858 string, all ordinary characters
 859 .TP 3
 860 \fBs\fR
 861 .
 862 non-newline-sensitive matching (usual default)
 863 .TP 3
 864 \fBt\fR
 865 .
 866 tight syntax (usual default; see below)
 867 .TP 3
 868 \fBw\fR
 869 .
 870 inverse partial newline-sensitive
 871 .PQ weird
 872 matching (see \fBMATCHING\fR, below)
 873 .TP 3
 874 \fBx\fR
 875 .
 876 expanded syntax (see below)
 877 .RE
 878 .PP
 879 Embedded options take effect at the \fB)\fR terminating the sequence.
 880 They are available only at the start of an ARE, and may not be used
 881 later within it.
 882 .PP
 883 In addition to the usual (\fItight\fR) RE syntax, in which all
 884 characters are significant, there is an \fIexpanded\fR syntax,
 885 available in all flavors of RE with the \fB\-expanded\fR switch, or in
 886 AREs with the embedded x option. In the expanded syntax, white-space
 887 characters are ignored and all characters between a \fB#\fR and the
 888 following newline (or the end of the RE) are ignored, permitting
 889 paragraphing and commenting a complex RE. There are three exceptions
 890 to that basic rule:
 891 .IP \(bu 3
 892 a white-space character or
 893 .QW \fB#\fR
 894 preceded by
 895 .QW \fB\e\fR
 896 is retained
 897 .IP \(bu 3
 898 white space or
 899 .QW \fB#\fR
 900 within a bracket expression is retained
 901 .IP \(bu 3
 902 white space and comments are illegal within multi-character symbols
 903 like the ARE
 904 .QW \fB(?:\fR
 905 or the BRE
 906 .QW \fB\e(\fR
 907 .PP
 908 Expanded-syntax white-space characters are blank, tab, newline, and
 909 any character that belongs to the \fIspace\fR character class.
 910 .PP
 911 Finally, in an ARE, outside bracket expressions, the sequence
 912 .QW \fB(?#\fIttt\fB)\fR
 913 (where \fIttt\fR is any text not containing a
 914 .QW \fB)\fR )
 915 is a comment, completely ignored. Again, this is not
 916 allowed between the characters of multi-character symbols like
 917 .QW \fB(?:\fR .
 918 Such comments are more a historical artifact than a useful facility,
 919 and their use is deprecated; use the expanded syntax instead.
 920 .PP
 921 \fINone\fR of these metasyntax extensions is available if the
 922 application (or an initial
 923 .QW \fB***=\fR
 924 director) has specified that the
 925 user's input be treated as a literal string rather than as an RE.
 926 .SH MATCHING
 927 In the event that an RE could match more than one substring of a given
 928 string, the RE matches the one starting earliest in the string. If
 929 the RE could match more than one substring starting at that point, its
 930 choice is determined by its \fIpreference\fR: either the longest
 931 substring, or the shortest.
 932 .PP
 933 Most atoms, and all constraints, have no preference. A parenthesized
 934 RE has the same preference (possibly none) as the RE. A quantified
 935 atom with quantifier \fB{\fIm\fB}\fR or \fB{\fIm\fB}?\fR has the same
 936 preference (possibly none) as the atom itself. A quantified atom with
 937 other normal quantifiers (including \fB{\fIm\fB,\fIn\fB}\fR with
 938 \fIm\fR equal to \fIn\fR) prefers longest match. A quantified atom
 939 with other non-greedy quantifiers (including \fB{\fIm\fB,\fIn\fB}?\fR
 940 with \fIm\fR equal to \fIn\fR) prefers shortest match. A branch has
 941 the same preference as the first quantified atom in it which has a
 942 preference. An RE consisting of two or more branches connected by the
 943 \fB|\fR operator prefers longest match.
 944 .PP
 945 Subject to the constraints imposed by the rules for matching the whole
 946 RE, subexpressions also match the longest or shortest possible
 947 substrings, based on their preferences, with subexpressions starting
 948 earlier in the RE taking priority over ones starting later. Note that
 949 outer subexpressions thus take priority over their component
 950 subexpressions.
 951 .PP
 952 Note that the quantifiers \fB{1,1}\fR and \fB{1,1}?\fR can be used to
 953 force longest and shortest preference, respectively, on a
 954 subexpression or a whole RE.
 955 .PP
 956 Match lengths are measured in characters, not collating elements. An
 957 empty string is considered longer than no match at all. For example,
 958 .QW \fBbb*\fR
 959 matches the three middle characters of
 960 .QW \fBabbbc\fR ;
 961 .QW \fB(week|wee)(night|knights)\fR
 962 matches all ten characters of
 963 .QW \fBweeknights\fR ;
 964 when
 965 .QW \fB(.*).*\fR
 966 is matched against
 967 .QW \fBabc\fR ,
 968 the parenthesized subexpression matches all three characters; and when
 969 .QW \fB(a*)*\fR
 970 is matched against
 971 .QW \fBbc\fR ,
 972 both the whole RE and the parenthesized subexpression match an empty string.
 973 .PP
 974 If case-independent matching is specified, the effect is much as if
 975 all case distinctions had vanished from the alphabet. When an
 976 alphabetic that exists in multiple cases appears as an ordinary
 977 character outside a bracket expression, it is effectively transformed
 978 into a bracket expression containing both cases, so that \fBx\fR
 979 becomes
 980 .QW \fB[xX]\fR .
 981 When it appears inside a bracket expression,
 982 all case counterparts of it are added to the bracket expression, so
 983 that
 984 .QW \fB[x]\fR
 985 becomes
 986 .QW \fB[xX]\fR
 987 and
 988 .QW \fB[^x]\fR
 989 becomes
 990 .QW \fB[^xX]\fR .
 991 .PP
 992 If newline-sensitive matching is specified, \fB.\fR and bracket
 993 expressions using \fB^\fR will never match the newline character (so
 994 that matches will never cross newlines unless the RE explicitly
 995 arranges it) and \fB^\fR and \fB$\fR will match the empty string after
 996 and before a newline respectively, in addition to matching at
 997 beginning and end of string respectively. The ARE escapes \fB\eA\fR and \fB\eZ\fR
 998 continue to match at the beginning and end of string \fIonly\fR, respectively.
 999 .PP
1000 If partial newline-sensitive matching is specified, this affects
1001 \fB.\fR and bracket expressions as with newline-sensitive matching,
1002 but not \fB^\fR and \fB$\fR.
1003 .PP
1004 If inverse partial newline-sensitive matching is specified, this
1005 affects \fB^\fR and \fB$\fR as with newline-sensitive matching, but
1006 not \fB.\fR and bracket expressions. This is not very useful but is
1007 provided for symmetry.
1008 .SH "LIMITS AND COMPATIBILITY"
1009 No particular limit is imposed on the length of REs. Programs
1010 intended to be highly portable should not employ REs longer than 256
1011 bytes, as a POSIX-compliant implementation can refuse to accept such
1012 REs.
1013 .PP
1014 The only feature of AREs that is actually incompatible with POSIX EREs
1015 is that \fB\e\fR does not lose its special significance inside bracket
1016 expressions. All other ARE features use syntax which is illegal or
1017 has undefined or unspecified effects in POSIX EREs; the \fB***\fR
1018 syntax of directors likewise is outside the POSIX syntax for both BREs
1019 and EREs.
1020 .PP
1021 Many of the ARE extensions are borrowed from Perl, but some have been
1022 changed to clean them up, and a few Perl extensions are not present.
1023 Incompatibilities of note include
1024 .QW \fB\eb\fR ,
1025 .QW \fB\eB\fR ,
1026 the lack of special treatment for a trailing newline, the addition of
1027 complemented bracket expressions to the things affected by
1028 newline-sensitive matching, the restrictions on parentheses and back
1029 references in lookahead constraints, and the longest/shortest-match
1030 (rather than first-match) matching semantics.
1031 .PP
1032 The matching rules for REs containing both normal and non-greedy
1033 quantifiers have changed since early beta-test versions of this
1034 package. (The new rules are much simpler and cleaner, but do not work
1035 as hard at guessing the user's real intentions.)
1036 .PP
1037 Henry Spencer's original 1986 \fIregexp\fR package, still in
1038 widespread use (e.g., in pre-8.1 releases of Tcl), implemented an
1039 early version of today's EREs. There are four incompatibilities
1040 between \fIregexp\fR's near-EREs
1041 .PQ RREs " for short"
1042 and AREs. In roughly increasing order of significance:
1043 .IP \(bu 3
1044 In AREs, \fB\e\fR followed by an alphanumeric character is either an
1045 escape or an error, while in RREs, it was just another way of writing
1046 the alphanumeric. This should not be a problem because there was no
1047 reason to write such a sequence in RREs.
1048 .IP \(bu 3
1049 \fB{\fR followed by a digit in an ARE is the beginning of a bound,
1050 while in RREs, \fB{\fR was always an ordinary character. Such
1051 sequences should be rare, and will often result in an error because
1052 following characters will not look like a valid bound.
1053 .IP \(bu 3
1054 In AREs, \fB\e\fR remains a special character within
1055 .QW \fB[\|]\fR ,
1056 so a literal \fB\e\fR within \fB[\|]\fR must be written
1057 .QW \fB\e\e\fR .
1058 \fB\e\e\fR also gives a literal \fB\e\fR within \fB[\|]\fR in RREs,
1059 but only truly paranoid programmers routinely doubled the backslash.
1060 .IP \(bu 3
1061 AREs report the longest/shortest match for the RE, rather than the
1062 first found in a specified search order. This may affect some RREs
1063 which were written in the expectation that the first match would be
1064 reported. (The careful crafting of RREs to optimize the search order
1065 for fast matching is obsolete, since AREs examine all possible matches
1066 in parallel and their performance is largely insensitive to their
1067 complexity, but cases where the search order was exploited to
1068 deliberately find a match which was \fInot\fR the longest/shortest
1069 will need rewriting.)
1070 .SH "BASIC REGULAR EXPRESSIONS"
1071 BREs differ from EREs in several respects.
1072 .QW \fB|\fR ,
1073 .QW \fB+\fR ,
1074 and \fB?\fR are ordinary characters and there is no equivalent for their
1075 functionality. The delimiters for bounds are \fB\e{\fR and
1076 .QW \fB\e}\fR ,
1077 with \fB{\fR and \fB}\fR by themselves ordinary characters. The
1078 parentheses for nested subexpressions are \fB\e(\fR and
1079 .QW \fB\e)\fR ,
1080 with \fB(\fR and \fB)\fR by themselves ordinary
1081 characters. \fB^\fR is an ordinary character except at the beginning
1082 of the RE or the beginning of a parenthesized subexpression, \fB$\fR
1083 is an ordinary character except at the end of the RE or the end of a
1084 parenthesized subexpression, and \fB*\fR is an ordinary character if
1085 it appears at the beginning of the RE or the beginning of a
1086 parenthesized subexpression (after a possible leading
1087 .QW \fB^\fR ).
1088 Finally, single-digit back references are available, and \fB\e<\fR and
1089 \fB\e>\fR are synonyms for
1090 .QW \fB[[:<:]]\fR
1091 and
1092 .QW \fB[[:>:]]\fR
1093 respectively; no other escapes are available.
1094 .SH "SEE ALSO"
1095 RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n)
1096 .SH KEYWORDS
1097 match, regular expression, string
1098 .\" Local Variables:
1099 .\" mode: nroff
1100 .\" End: