GPL_bin_fullset/NaturalDocs/Modules/NaturalDocs/Parser/Native.pm

   1 ###############################################################################
   2 #
   3 #   Package: NaturalDocs::Parser::Native
   4 #
   5 ###############################################################################
   6 #
   7 #   A package that converts comments from Natural Docs' native format into <NaturalDocs::Parser::ParsedTopic> objects.
   8 #   Unlike most second-level packages, these are packages and not object classes.
   9 #
  10 ###############################################################################
  11
  12 # This file is part of Natural Docs, which is Copyright © 2003-2010 Greg Valure
  13 # Natural Docs is licensed under version 3 of the GNU Affero General Public License (AGPL)
  14 # Refer to License.txt for the complete details
  15
  16
  17 use strict;
  18 use integer;
  19
  20 package NaturalDocs::Parser::Native;
  21
  22
  23 ###############################################################################
  24 # Group: Variables
  25
  26
  27 # Return values of TagType().  Not documented here.
  28 use constant POSSIBLE_OPENING_TAG => 1;
  29 use constant POSSIBLE_CLOSING_TAG => 2;
  30 use constant NOT_A_TAG => 3;
  31
  32
  33 #
  34 #   var: package
  35 #
  36 #   A <SymbolString> representing the package normal topics will be a part of at the current point in the file.  This is a package variable
  37 #   because it needs to be reserved between function calls.
  38 #
  39 my $package;
  40
  41 #
  42 #   hash: functionListIgnoredHeadings
  43 #
  44 #   An existence hash of all the headings that prevent the parser from creating function list symbols.  Whenever one of
  45 #   these headings are used in a function list topic, symbols are not created from definition lists until the next heading.  The keys
  46 #   are in all lowercase.
  47 #
  48 my %functionListIgnoredHeadings = ( 'parameters' => 1,
  49                                                        'parameter' => 1,
  50                                                        'params' => 1,
  51                                                        'param' => 1,
  52                                                        'arguments' => 1,
  53                                                        'argument' => 1,
  54                                                        'args' => 1,
  55                                                        'arg' => 1 );
  56
  57
  58 ###############################################################################
  59 # Group: Interface Functions
  60
  61
  62 #
  63 #   Function: Start
  64 #
  65 #   This will be called whenever a file is about to be parsed.  It allows the package to reset its internal state.
  66 #
  67 sub Start
  68     {
  69     my ($self) = @_;
  70     $package = undef;
  71     };
  72
  73
  74 #
  75 #   Function: IsMine
  76 #
  77 #   Examines the comment and returns whether it is *definitely* Natural Docs content, i.e. it is owned by this package.  Note
  78 #   that a comment can fail this function and still be interpreted as a Natural Docs content, for example a JavaDoc-styled comment
  79 #   that doesn't have header lines but no JavaDoc tags either.
  80 #
  81 #   Parameters:
  82 #
  83 #       commentLines - An arrayref of the comment lines.  Must have been run through <NaturalDocs::Parser->CleanComment()>.
  84 #       isJavaDoc - Whether the comment was JavaDoc-styled.
  85 #
  86 #   Returns:
  87 #
  88 #       Whether the comment is *definitely* Natural Docs content.
  89 #
  90 sub IsMine #(string[] commentLines, bool isJavaDoc)
  91     {
  92     my ($self, $commentLines, $isJavaDoc) = @_;
  93
  94     # Skip to the first line with content.
  95     my $line = 0;
  96
  97     while ($line < scalar @$commentLines && !length $commentLines->[$line])
  98         {  $line++;  };
  99
 100     return $self->ParseHeaderLine($commentLines->[$line]);
 101     };
 102
 103
 104
 105 #
 106 #   Function: ParseComment
 107 #
 108 #   This will be called whenever a comment capable of containing Natural Docs content is found.
 109 #
 110 #   Parameters:
 111 #
 112 #       commentLines - An arrayref of the comment lines.  Must have been run through <NaturalDocs::Parser->CleanComment()>.
 113 #                               *The original memory will be changed.*
 114 #       isJavaDoc - Whether the comment is JavaDoc styled.
 115 #       lineNumber - The line number of the first of the comment lines.
 116 #       parsedTopics - A reference to the array where any new <NaturalDocs::Parser::ParsedTopics> should be placed.
 117 #
 118 #   Returns:
 119 #
 120 #       The number of parsed topics added to the array, or zero if none.
 121 #
 122 sub ParseComment #(commentLines, isJavaDoc, lineNumber, parsedTopics)
 123     {
 124     my ($self, $commentLines, $isJavaDoc, $lineNumber, $parsedTopics) = @_;
 125
 126     my $topicCount = 0;
 127     my $prevLineBlank = 1;
 128     my $inCodeSection = 0;
 129
 130     my ($type, $scope, $isPlural, $title, $symbol);
 131     #my $package;  # package variable.
 132     my ($newKeyword, $newTitle);
 133
 134     my $index = 0;
 135
 136     my $bodyStart = 0;
 137     my $bodyEnd = 0;  # Not inclusive.
 138
 139     while ($index < scalar @$commentLines)
 140         {
 141         # Everything but leading whitespace was removed beforehand.
 142
 143         # If we're in a code section...
 144         if ($inCodeSection)
 145             {
 146             if ($commentLines->[$index] =~ /^ *\( *(?:end|finish|done)(?: +(?:table|code|example|diagram))? *\)$/i)
 147                 {  $inCodeSection = undef;  };
 148
 149             $prevLineBlank = 0;
 150             $bodyEnd++;
 151             }
 152
 153         # If the line is empty...
 154         elsif (!length($commentLines->[$index]))
 155             {
 156             $prevLineBlank = 1;
 157
 158             if ($topicCount)
 159                 {  $bodyEnd++;  };
 160             }
 161
 162         # If the line has a recognized header and the previous line is blank...
 163         elsif ($prevLineBlank && (($newKeyword, $newTitle) = $self->ParseHeaderLine($commentLines->[$index])) )
 164             {
 165             # Process the previous one, if any.
 166
 167             if ($topicCount)
 168                 {
 169                 if ($scope == ::SCOPE_START() || $scope == ::SCOPE_END())
 170                     {  $package = undef;  };
 171
 172                 my $body = $self->FormatBody($commentLines, $bodyStart, $bodyEnd, $type, $isPlural);
 173                 my $newTopic = $self->MakeParsedTopic($type, $title, $package, $body, $lineNumber + $bodyStart - 1, $isPlural);
 174                 push @$parsedTopics, $newTopic;
 175
 176                 $package = $newTopic->Package();
 177                 };
 178
 179             $title = $newTitle;
 180
 181             my $typeInfo;
 182             ($type, $typeInfo, $isPlural) = NaturalDocs::Topics->KeywordInfo($newKeyword);
 183             $scope = $typeInfo->Scope();
 184
 185             $bodyStart = $index + 1;
 186             $bodyEnd = $index + 1;
 187
 188             $topicCount++;
 189
 190             $prevLineBlank = 0;
 191             }
 192
 193         # If we're on a non-empty, non-header line of a JavaDoc-styled comment and we haven't started a topic yet...
 194         elsif ($isJavaDoc && !$topicCount)
 195             {
 196             $type = undef;
 197             $scope = ::SCOPE_NORMAL();  # The scope repair and topic merging processes will handle if this is a class topic.
 198             $isPlural = undef;
 199             $title = undef;
 200             $symbol = undef;
 201
 202             $bodyStart = $index;
 203             $bodyEnd = $index + 1;
 204
 205             $topicCount++;
 206
 207             $prevLineBlank = undef;
 208             }
 209
 210         # If we're on a normal content line within a topic
 211         elsif ($topicCount)
 212             {
 213             $prevLineBlank = 0;
 214             $bodyEnd++;
 215
 216             if ($commentLines->[$index] =~ /^ *\( *(?:(?:start|begin)? +)?(?:table|code|example|diagram) *\)$/i)
 217                 {  $inCodeSection = 1;  };
 218             };
 219
 220
 221         $index++;
 222         };
 223
 224
 225     # Last one, if any.  This is the only one that gets the prototypes.
 226     if ($topicCount)
 227         {
 228         if ($scope == ::SCOPE_START() || $scope == ::SCOPE_END())
 229             {  $package = undef;  };
 230
 231         my $body = $self->FormatBody($commentLines, $bodyStart, $bodyEnd, $type, $isPlural);
 232         my $newTopic = $self->MakeParsedTopic($type, $title, $package, $body, $lineNumber + $bodyStart - 1, $isPlural);
 233         push @$parsedTopics, $newTopic;
 234         $topicCount++;
 235
 236         $package = $newTopic->Package();
 237         };
 238
 239     return $topicCount;
 240     };
 241
 242
 243 #
 244 #   Function: ParseHeaderLine
 245 #
 246 #   If the passed line is a topic header, returns the array ( keyword, title ).  Otherwise returns an empty array.
 247 #
 248 sub ParseHeaderLine #(line)
 249     {
 250     my ($self, $line) = @_;
 251
 252     if ($line =~ /^ *([a-z0-9 ]*[a-z0-9]): +(.*)$/i)
 253         {
 254         my ($keyword, $title) = ($1, $2);
 255
 256         # We need to do it this way because if you do "if (ND:T->KeywordInfo($keyword)" and the last element of the array it
 257         # returns is false, the statement is false.  That is really retarded, but there it is.
 258         my ($type, undef, undef) = NaturalDocs::Topics->KeywordInfo($keyword);
 259
 260         if ($type)
 261             {  return ($keyword, $title);  }
 262         else
 263             {  return ( );  };
 264         }
 265     else
 266         {  return ( );  };
 267     };
 268
 269
 270
 271 ###############################################################################
 272 # Group: Support Functions
 273
 274
 275 #
 276 #   Function: MakeParsedTopic
 277 #
 278 #   Creates a <NaturalDocs::Parser::ParsedTopic> object for the passed parameters.  Scope is gotten from
 279 #   the package variable <package> instead of from the parameters.  The summary is generated from the body.
 280 #
 281 #   Parameters:
 282 #
 283 #       type         - The <TopicType>.  May be undef for headerless topics.
 284 #       title          - The title of the topic.  May be undef for headerless topics.
 285 #       package    - The package <SymbolString> the topic appears in.
 286 #       body        - The topic's body in <NDMarkup>.
 287 #       lineNumber - The topic's line number.
 288 #       isList         - Whether the topic is a list.
 289 #
 290 #   Returns:
 291 #
 292 #       The <NaturalDocs::Parser::ParsedTopic> object.
 293 #
 294 sub MakeParsedTopic #(type, title, package, body, lineNumber, isList)
 295     {
 296     my ($self, $type, $title, $package, $body, $lineNumber, $isList) = @_;
 297
 298     my $summary;
 299
 300     if (defined $body)
 301         {  $summary = NaturalDocs::Parser->GetSummaryFromBody($body);  };
 302
 303     return NaturalDocs::Parser::ParsedTopic->New($type, $title, $package, undef, undef, $summary,
 304                                                                          $body, $lineNumber, $isList);
 305     };
 306
 307
 308 #
 309 #    Function: FormatBody
 310 #
 311 #    Converts the section body to <NDMarkup>.
 312 #
 313 #    Parameters:
 314 #
 315 #       commentLines - The arrayref of comment lines.
 316 #       startingIndex  - The starting index of the body to format.
 317 #       endingIndex   - The ending index of the body to format, *not* inclusive.
 318 #       type               - The type of the section.  May be undef for headerless comments.
 319 #       isList              - Whether it's a list topic.
 320 #
 321 #    Returns:
 322 #
 323 #        The body formatted in <NDMarkup>.
 324 #
 325 sub FormatBody #(commentLines, startingIndex, endingIndex, type, isList)
 326     {
 327     my ($self, $commentLines, $startingIndex, $endingIndex, $type, $isList) = @_;
 328
 329     use constant TAG_NONE => 1;
 330     use constant TAG_PARAGRAPH => 2;
 331     use constant TAG_BULLETLIST => 3;
 332     use constant TAG_DESCRIPTIONLIST => 4;
 333     use constant TAG_HEADING => 5;
 334     use constant TAG_PREFIXCODE => 6;
 335     use constant TAG_TAGCODE => 7;
 336
 337     my %tagEnders = ( TAG_NONE() => '',
 338                                  TAG_PARAGRAPH() => '</p>',
 339                                  TAG_BULLETLIST() => '</li></ul>',
 340                                  TAG_DESCRIPTIONLIST() => '</dd></dl>',
 341                                  TAG_HEADING() => '</h>',
 342                                  TAG_PREFIXCODE() => '</code>',
 343                                  TAG_TAGCODE() => '</code>' );
 344
 345     my $topLevelTag = TAG_NONE;
 346
 347     my $output;
 348     my $textBlock;
 349     my $prevLineBlank = 1;
 350
 351     my $codeBlock;
 352     my $removedCodeSpaces;
 353
 354     my $ignoreListSymbols;
 355
 356     my $index = $startingIndex;
 357
 358     while ($index < $endingIndex)
 359         {
 360         # If we're in a tagged code section...
 361         if ($topLevelTag == TAG_TAGCODE)
 362             {
 363             if ($commentLines->[$index] =~ /^ *\( *(?:end|finish|done)(?: +(?:table|code|example|diagram))? *\)$/i)
 364                 {
 365                 $codeBlock =~ s/\n+$//;
 366                 $output .= NaturalDocs::NDMarkup->ConvertAmpChars($codeBlock) . '</code>';
 367                 $codeBlock = undef;
 368                 $topLevelTag = TAG_NONE;
 369                 $prevLineBlank = undef;
 370                 }
 371             else
 372                 {
 373                 $self->AddToCodeBlock($commentLines->[$index], \$codeBlock, \$removedCodeSpaces);
 374                 };
 375             }
 376
 377         # If the line starts with a code designator...
 378         elsif ($commentLines->[$index] =~ /^ *[>:|](.*)$/)
 379             {
 380             my $code = $1;
 381
 382             if ($topLevelTag == TAG_PREFIXCODE)
 383                 {
 384                 $self->AddToCodeBlock($code, \$codeBlock, \$removedCodeSpaces);
 385                 }
 386             else # $topLevelTag != TAG_PREFIXCODE
 387                 {
 388                 if (defined $textBlock)
 389                     {
 390                     $output .= $self->RichFormatTextBlock($textBlock) . $tagEnders{$topLevelTag};
 391                     $textBlock = undef;
 392                     };
 393
 394                 $topLevelTag = TAG_PREFIXCODE;
 395                 $output .= '<code type="anonymous">';
 396                 $self->AddToCodeBlock($code, \$codeBlock, \$removedCodeSpaces);
 397                 };
 398             }
 399
 400         # If we're not in either code style...
 401         else
 402             {
 403             # Strip any leading whitespace.
 404             $commentLines->[$index] =~ s/^ +//;
 405
 406             # If we were in a prefixed code section...
 407             if ($topLevelTag == TAG_PREFIXCODE)
 408                 {
 409                 $codeBlock =~ s/\n+$//;
 410                 $output .= NaturalDocs::NDMarkup->ConvertAmpChars($codeBlock) . '</code>';
 411                 $codeBlock = undef;
 412                 $topLevelTag = TAG_NONE;
 413                 $prevLineBlank = undef;
 414                 };
 415
 416
 417             # If the line is blank...
 418             if (!length($commentLines->[$index]))
 419                 {
 420                 # End a paragraph.  Everything else ignores it for now.
 421                 if ($topLevelTag == TAG_PARAGRAPH)
 422                     {
 423                     $output .= $self->RichFormatTextBlock($textBlock) . '</p>';
 424                     $textBlock = undef;
 425                     $topLevelTag = TAG_NONE;
 426                     };
 427
 428                 $prevLineBlank = 1;
 429                 }
 430
 431             # If the line starts with a bullet...
 432             elsif ($commentLines->[$index] =~ /^[-\*o+] +([^ ].*)$/ &&
 433                     substr($1, 0, 2) ne '- ')  # Make sure "o - Something" is a definition, not a bullet.
 434                 {
 435                 my $bulletedText = $1;
 436
 437                 if (defined $textBlock)
 438                     {  $output .= $self->RichFormatTextBlock($textBlock);  };
 439
 440                 if ($topLevelTag == TAG_BULLETLIST)
 441                     {
 442                     $output .= '</li><li>';
 443                     }
 444                 else #($topLevelTag != TAG_BULLETLIST)
 445                     {
 446                     $output .= $tagEnders{$topLevelTag} . '<ul><li>';
 447                     $topLevelTag = TAG_BULLETLIST;
 448                     };
 449
 450                 $textBlock = $bulletedText;
 451
 452                 $prevLineBlank = undef;
 453                 }
 454
 455             # If the line looks like a description list entry...
 456             elsif ($commentLines->[$index] =~ /^(.+?) +- +([^ ].*)$/ && $topLevelTag != TAG_PARAGRAPH)
 457                 {
 458                 my $entry = $1;
 459                 my $description = $2;
 460
 461                 if (defined $textBlock)
 462                     {  $output .= $self->RichFormatTextBlock($textBlock);  };
 463
 464                 if ($topLevelTag == TAG_DESCRIPTIONLIST)
 465                     {
 466                     $output .= '</dd>';
 467                     }
 468                 else #($topLevelTag != TAG_DESCRIPTIONLIST)
 469                     {
 470                     $output .= $tagEnders{$topLevelTag} . '<dl>';
 471                     $topLevelTag = TAG_DESCRIPTIONLIST;
 472                     };
 473
 474                 if (($isList && !$ignoreListSymbols) || $type eq ::TOPIC_ENUMERATION())
 475                     {
 476                     $output .= '<ds>' . NaturalDocs::NDMarkup->ConvertAmpChars($entry) . '</ds><dd>';
 477                     }
 478                 else
 479                     {
 480                     $output .= '<de>' . NaturalDocs::NDMarkup->ConvertAmpChars($entry) . '</de><dd>';
 481                     };
 482
 483                 $textBlock = $description;
 484
 485                 $prevLineBlank = undef;
 486                 }
 487
 488             # If the line could be a header...
 489             elsif ($prevLineBlank && $commentLines->[$index] =~ /^(.*)([^ ]):$/)
 490                 {
 491                 my $headerText = $1 . $2;
 492
 493                 if (defined $textBlock)
 494                     {
 495                     $output .= $self->RichFormatTextBlock($textBlock);
 496                     $textBlock = undef;
 497                     }
 498
 499                 $output .= $tagEnders{$topLevelTag};
 500                 $topLevelTag = TAG_NONE;
 501
 502                 $output .= '<h>' . $self->RichFormatTextBlock($headerText) . '</h>';
 503
 504                 if ($type eq ::TOPIC_FUNCTION() && $isList)
 505                     {
 506                     $ignoreListSymbols = exists $functionListIgnoredHeadings{lc($headerText)};
 507                     };
 508
 509                 $prevLineBlank = undef;
 510                 }
 511
 512             # If the line looks like a code tag...
 513             elsif ($commentLines->[$index] =~ /^\( *(?:(?:start|begin)? +)?(table|code|example|diagram) *\)$/i)
 514                 {
 515                                 my $codeType = lc($1);
 516
 517                 if (defined $textBlock)
 518                     {
 519                     $output .= $self->RichFormatTextBlock($textBlock);
 520                     $textBlock = undef;
 521                     };
 522
 523                 if ($codeType eq 'example')
 524                         {  $codeType = 'anonymous';  }
 525                 elsif ($codeType eq 'table' || $codeType eq 'diagram')
 526                         {  $codeType = 'text';  }
 527                 # else leave it 'code'
 528
 529                 $output .= $tagEnders{$topLevelTag} . '<code type="' . $codeType . '">';
 530                 $topLevelTag = TAG_TAGCODE;
 531                 }
 532
 533             # If the line looks like an inline image...
 534             elsif ($commentLines->[$index] =~ /^(\( *see +)([^\)]+?)( *\))$/i)
 535                 {
 536                 if (defined $textBlock)
 537                     {
 538                     $output .= $self->RichFormatTextBlock($textBlock);
 539                     $textBlock = undef;
 540                     };
 541
 542                 $output .= $tagEnders{$topLevelTag};
 543                 $topLevelTag = TAG_NONE;
 544
 545                 $output .= '<img mode="inline" target="' . NaturalDocs::NDMarkup->ConvertAmpChars($2) . '" '
 546                                 . 'original="' . NaturalDocs::NDMarkup->ConvertAmpChars($1 . $2 . $3) . '">';
 547
 548                 $prevLineBlank = undef;
 549                 }
 550
 551             # If the line isn't any of those, we consider it normal text.
 552             else
 553                 {
 554                 # A blank line followed by normal text ends lists.  We don't handle this when we detect if the line's blank because
 555                 # we don't want blank lines between list items to break the list.
 556                 if ($prevLineBlank && ($topLevelTag == TAG_BULLETLIST || $topLevelTag == TAG_DESCRIPTIONLIST))
 557                     {
 558                     $output .= $self->RichFormatTextBlock($textBlock) . $tagEnders{$topLevelTag} . '<p>';
 559
 560                     $topLevelTag = TAG_PARAGRAPH;
 561                     $textBlock = undef;
 562                     }
 563
 564                 elsif ($topLevelTag == TAG_NONE)
 565                     {
 566                     $output .= '<p>';
 567                     $topLevelTag = TAG_PARAGRAPH;
 568                     # textBlock will already be undef.
 569                     };
 570
 571                 if (defined $textBlock)
 572                     {  $textBlock .= ' ';  };
 573
 574                 $textBlock .= $commentLines->[$index];
 575
 576                 $prevLineBlank = undef;
 577                 };
 578             };
 579
 580         $index++;
 581         };
 582
 583     # Clean up anything left dangling.
 584     if (defined $textBlock)
 585         {
 586         $output .= $self->RichFormatTextBlock($textBlock) . $tagEnders{$topLevelTag};
 587         }
 588     elsif (defined $codeBlock)
 589         {
 590         $codeBlock =~ s/\n+$//;
 591         $output .= NaturalDocs::NDMarkup->ConvertAmpChars($codeBlock) . '</code>';
 592         };
 593
 594     return $output;
 595     };
 596
 597
 598 #
 599 #   Function: AddToCodeBlock
 600 #
 601 #   Adds a line of text to a code block, handling all the indentation processing required.
 602 #
 603 #   Parameters:
 604 #
 605 #       line - The line of text to add.
 606 #       codeBlockRef - A reference to the code block to add it to.
 607 #       removedSpacesRef - A reference to a variable to hold the number of spaces removed.  It needs to be stored between calls.
 608 #                                      It will reset itself automatically when the code block codeBlockRef points to is undef.
 609 #
 610 sub AddToCodeBlock #(line, codeBlockRef, removedSpacesRef)
 611     {
 612     my ($self, $line, $codeBlockRef, $removedSpacesRef) = @_;
 613
 614     $line =~ /^( *)(.*)$/;
 615     my ($spaces, $code) = ($1, $2);
 616
 617     if (!defined $$codeBlockRef)
 618         {
 619         if (length($code))
 620             {
 621             $$codeBlockRef = $code . "\n";
 622             $$removedSpacesRef = length($spaces);
 623             };
 624         # else ignore leading line breaks.
 625         }
 626
 627     elsif (length $code)
 628         {
 629         # Make sure we have the minimum amount of spaces to the left possible.
 630         if (length($spaces) != $$removedSpacesRef)
 631             {
 632             my $spaceDifference = abs( length($spaces) - $$removedSpacesRef );
 633             my $spacesToAdd = ' ' x $spaceDifference;
 634
 635             if (length($spaces) > $$removedSpacesRef)
 636                 {
 637                 $$codeBlockRef .= $spacesToAdd;
 638                 }
 639             else
 640                 {
 641                 $$codeBlockRef =~ s/^(.)/$spacesToAdd . $1/gme;
 642                 $$removedSpacesRef = length($spaces);
 643                 };
 644             };
 645
 646         $$codeBlockRef .= $code . "\n";
 647         }
 648
 649     else # (!length $code)
 650         {
 651         $$codeBlockRef .= "\n";
 652         };
 653     };
 654
 655
 656 #
 657 #   Function: RichFormatTextBlock
 658 #
 659 #   Applies rich <NDMarkup> formatting to a chunk of text.  This includes both amp chars, formatting tags, and link tags.
 660 #
 661 #   Parameters:
 662 #
 663 #       text - The block of text to format.
 664 #
 665 #   Returns:
 666 #
 667 #       The formatted text block.
 668 #
 669 sub RichFormatTextBlock #(text)
 670     {
 671     my ($self, $text) = @_;
 672     my $output;
 673
 674
 675     # First find bare urls, e-mail addresses, and images.  We have to do this before the split because they may contain underscores
 676     # or asterisks.  We have to mark the tags with \x1E and \x1F so they don't get confused with angle brackets from the comment.
 677     # We can't convert the amp chars beforehand because we need lookbehinds in the regexps below and they need to be
 678     # constant length.  Sucks, huh?
 679
 680     $text =~ s{
 681                        # The previous character can't be an alphanumeric or an opening angle bracket.
 682                        (?<!  [a-z0-9<]  )
 683
 684                        # Optional mailto:.  Ignored in output.
 685                        (?:mailto\:)?
 686
 687                        # Begin capture
 688                        (
 689
 690                        # The user portion.  Alphanumeric and - _.  Dots can appear between, but not at the edges or more than
 691                        # one in a row.
 692                        (?:  [a-z0-9\-_]+  \.  )*   [a-z0-9\-_]+
 693
 694                        @
 695
 696                        # The domain.  Alphanumeric and -.  Dots same as above, however, there must be at least two sections
 697                        # and the last one must be two to four alphanumeric characters (.com, .uk, .info, .203 for IP addresses)
 698                        (?:  [a-z0-9\-]+  \.  )+  [a-z]{2,4}
 699
 700                        # End capture.
 701                        )
 702
 703                        # The next character can't be an alphanumeric, which should prevent .abcde from matching the two to
 704                        # four character requirement, or a closing angle bracket.
 705                        (?!  [a-z0-9>]  )
 706
 707                        }
 708
 709                        {"\x1E" . 'email target="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '" '
 710                        . 'name="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '"' . "\x1F"}igxe;
 711
 712     $text =~ s{
 713                        # The previous character can't be an alphanumeric or an opening angle bracket.
 714                        (?<!  [a-z0-9<]  )
 715
 716                        # Begin capture.
 717                        (
 718
 719                        # URL must start with one of the acceptable protocols.
 720                        (?:http|https|ftp|news|file)\:
 721
 722                        # The acceptable URL characters as far as I know.
 723                        [a-z0-9\-\=\~\@\#\%\&\_\+\/\;\:\?\*\.\,]*
 724
 725                        # The URL characters minus period and comma.  If it ends on them, they're probably intended as
 726                        # punctuation.
 727                        [a-z0-9\-\=\~\@\#\%\&\_\+\/\;\:\?\*]
 728
 729                        # End capture.
 730                        )
 731
 732                        # The next character must not be an acceptable character or a closing angle bracket.  It must also not be a
 733                                            # dot and then an acceptable character.  These will prevent the URL from ending early just to get a match.
 734                        (?!  \.?[a-z0-9\-\=\~\@\#\%\&\_\+\/\;\:\?\*\>]  )
 735
 736                        }
 737
 738                        {"\x1E" . 'url target="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '" '
 739                        . 'name="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '"' . "\x1F"}igxe;
 740
 741
 742     # Find image links.  Inline images should already be pulled out by now.
 743
 744     $text =~ s{(\( *see +)([^\)\<\>]+?)( *\))}
 745                       {"\x1E" . 'img mode="link" target="' . NaturalDocs::NDMarkup->ConvertAmpChars($2) . '" '
 746                         . 'original="' . NaturalDocs::NDMarkup->ConvertAmpChars($1 . $2 . $3) . '"' . "\x1F"}gie;
 747
 748
 749
 750     # Split the text from the potential tags.
 751
 752     my @tempTextBlocks = split(/([\*_<>\x1E\x1F])/, $text);
 753
 754     # Since the symbols are considered dividers, empty strings could appear between two in a row or at the beginning/end of the
 755     # array.  This could seriously screw up TagType(), so we need to get rid of them.
 756     my @textBlocks;
 757
 758     while (scalar @tempTextBlocks)
 759         {
 760         my $tempTextBlock = shift @tempTextBlocks;
 761
 762         if (length $tempTextBlock)
 763             {  push @textBlocks, $tempTextBlock;  };
 764         };
 765
 766
 767     my $bold;
 768     my $underline;
 769     my $underlineHasWhitespace;
 770
 771     my $index = 0;
 772
 773     while ($index < scalar @textBlocks)
 774         {
 775         if ($textBlocks[$index] eq "\x1E")
 776             {
 777             $output .= '<';
 778             $index++;
 779
 780             while ($textBlocks[$index] ne "\x1F")
 781                 {
 782                 $output .= $textBlocks[$index];
 783                 $index++;
 784                 };
 785
 786             $output .= '>';
 787             }
 788
 789         elsif ($textBlocks[$index] eq '<' && $self->TagType(\@textBlocks, $index) == POSSIBLE_OPENING_TAG)
 790             {
 791             my $endingIndex = $self->ClosingTag(\@textBlocks, $index, undef);
 792
 793             if ($endingIndex != -1)
 794                 {
 795                 my $linkText;
 796                 $index++;
 797
 798                 while ($index < $endingIndex)
 799                     {
 800                     $linkText .= $textBlocks[$index];
 801                     $index++;
 802                     };
 803                 # Index will be incremented again at the end of the loop.
 804
 805                 $linkText = NaturalDocs::NDMarkup->ConvertAmpChars($linkText);
 806
 807                 if ($linkText =~ /^(?:mailto\:)?((?:[a-z0-9\-_]+\.)*[a-z0-9\-_]+@(?:[a-z0-9\-]+\.)+[a-z]{2,4})$/i)
 808                     {  $output .= '<email target="' . $1 . '" name="' . $1 . '">';  }
 809                 elsif ($linkText =~ /^(.+?) at (?:mailto\:)?((?:[a-z0-9\-_]+\.)*[a-z0-9\-_]+@(?:[a-z0-9\-]+\.)+[a-z]{2,4})$/i)
 810                     {  $output .= '<email target="' . $2 . '" name="' . $1 . '">';  }
 811                 elsif ($linkText =~ /^(?:http|https|ftp|news|file)\:/i)
 812                     {  $output .= '<url target="' . $linkText . '" name="' . $linkText . '">';  }
 813                 elsif ($linkText =~ /^(.+?) at ((?:http|https|ftp|news|file)\:.+)/i)
 814                     {  $output .= '<url target="' . $2 . '" name="' . $1 . '">';  }
 815                 else
 816                     {  $output .= '<link target="' . $linkText . '" name="' . $linkText . '" original="&lt;' . $linkText . '&gt;">';  };
 817                 }
 818
 819             else # it's not a link.
 820                 {
 821                 $output .= '&lt;';
 822                 };
 823             }
 824
 825         elsif ($textBlocks[$index] eq '*')
 826             {
 827             my $tagType = $self->TagType(\@textBlocks, $index);
 828
 829             if ($tagType == POSSIBLE_OPENING_TAG && $self->ClosingTag(\@textBlocks, $index, undef) != -1)
 830                 {
 831                 # ClosingTag() makes sure tags aren't opened multiple times in a row.
 832                 $bold = 1;
 833                 $output .= '<b>';
 834                 }
 835             elsif ($bold && $tagType == POSSIBLE_CLOSING_TAG)
 836                 {
 837                 $bold = undef;
 838                 $output .= '</b>';
 839                 }
 840             else
 841                 {
 842                 $output .= '*';
 843                 };
 844             }
 845
 846         elsif ($textBlocks[$index] eq '_')
 847             {
 848             my $tagType = $self->TagType(\@textBlocks, $index);
 849
 850              if ($tagType == POSSIBLE_OPENING_TAG && $self->ClosingTag(\@textBlocks, $index, \$underlineHasWhitespace) != -1)
 851                 {
 852                 # ClosingTag() makes sure tags aren't opened multiple times in a row.
 853                 $underline = 1;
 854                 #underlineHasWhitespace is set by ClosingTag().
 855                 $output .= '<u>';
 856                 }
 857             elsif ($underline && $tagType == POSSIBLE_CLOSING_TAG)
 858                 {
 859                 $underline = undef;
 860                 #underlineHasWhitespace will be reset by the next opening underline.
 861                 $output .= '</u>';
 862                 }
 863             elsif ($underline && !$underlineHasWhitespace)
 864                 {
 865                 # If there's no whitespace between underline tags, all underscores are replaced by spaces so
 866                 # _some_underlined_text_ becomes <u>some underlined text</u>.  The standard _some underlined text_
 867                 # will work too.
 868                 $output .= ' ';
 869                 }
 870             else
 871                 {
 872                 $output .= '_';
 873                 };
 874             }
 875
 876         else # plain text or a > that isn't part of a link
 877             {
 878             $output .= NaturalDocs::NDMarkup->ConvertAmpChars($textBlocks[$index]);
 879            };
 880
 881         $index++;
 882         };
 883
 884     return $output;
 885     };
 886
 887
 888 #
 889 #   Function: TagType
 890 #
 891 #   Returns whether the tag is a possible opening or closing tag, or neither.  "Possible" because it doesn't check if an opening tag is
 892 #   closed or a closing tag is opened, just whether the surrounding characters allow it to be a candidate for a tag.  For example, in
 893 #   "A _B" the underscore is a possible opening underline tag, but in "A_B" it is not.  Support function for <RichFormatTextBlock()>.
 894 #
 895 #   Parameters:
 896 #
 897 #       textBlocks  - A reference to an array of text blocks.
 898 #       index         - The index of the tag.
 899 #
 900 #   Returns:
 901 #
 902 #       POSSIBLE_OPENING_TAG, POSSIBLE_CLOSING_TAG, or NOT_A_TAG.
 903 #
 904 sub TagType #(textBlocks, index)
 905     {
 906     my ($self, $textBlocks, $index) = @_;
 907
 908
 909     # Possible opening tags
 910
 911     if ( ( $textBlocks->[$index] =~ /^[\*_<]$/ ) &&
 912
 913         # Before it must be whitespace, the beginning of the text, or ({["'-/*_.
 914         ( $index == 0 || $textBlocks->[$index-1] =~ /[\ \t\n\(\{\[\"\'\-\/\*\_]$/ ) &&
 915
 916         # Notes for 2.0: Include Spanish upside down ! and ? as well as opening quotes (66) and apostrophes (6).  Look into
 917         # Unicode character classes as well.
 918
 919         # After it must be non-whitespace.
 920         ( $index + 1 < scalar @$textBlocks && $textBlocks->[$index+1] !~ /^[\ \t\n]/) &&
 921
 922         # Make sure we don't accept <<, <=, <-, or *= as opening tags.
 923         ( $textBlocks->[$index] ne '<' || $textBlocks->[$index+1] !~ /^[<=-]/ ) &&
 924         ( $textBlocks->[$index] ne '*' || $textBlocks->[$index+1] !~ /^[\=\*]/ ) &&
 925
 926         # Make sure we don't accept * or _ before it unless it's <.
 927         ( $textBlocks->[$index] eq '<' || $index == 0 || $textBlocks->[$index-1] !~ /[\*\_]$/) )
 928         {
 929         return POSSIBLE_OPENING_TAG;
 930         }
 931
 932
 933     # Possible closing tags
 934
 935     elsif ( ( $textBlocks->[$index] =~ /^[\*_>]$/) &&
 936
 937             # After it must be whitespace, the end of the text, or )}].,!?"';:-/*_.
 938             ( $index + 1 == scalar @$textBlocks || $textBlocks->[$index+1] =~ /^[ \t\n\)\]\}\.\,\!\?\"\'\;\:\-\/\*\_]/ ||
 939               # Links also get plurals, like <link>s, <linx>es, <link>'s, and <links>'.
 940               ( $textBlocks->[$index] eq '>' && $textBlocks->[$index+1] =~ /^(?:es|s|\')/ ) ) &&
 941
 942             # Notes for 2.0: Include closing quotes (99) and apostrophes (9).  Look into Unicode character classes as well.
 943
 944             # Before it must be non-whitespace.
 945             ( $index != 0 && $textBlocks->[$index-1] !~ /[ \t\n]$/ ) &&
 946
 947             # Make sure we don't accept >>, ->, or => as closing tags.  >= is already taken care of.
 948             ( $textBlocks->[$index] ne '>' || $textBlocks->[$index-1] !~ /[>=-]$/ ) &&
 949
 950             # Make sure we don't accept * or _ after it unless it's >.
 951             ( $textBlocks->[$index] eq '>' || $textBlocks->[$index+1] !~ /[\*\_]$/) )
 952         {
 953         return POSSIBLE_CLOSING_TAG;
 954         }
 955
 956     else
 957         {
 958         return NOT_A_TAG;
 959         };
 960
 961     };
 962
 963
 964 #
 965 #   Function: ClosingTag
 966 #
 967 #   Returns whether a tag is closed or not, where it's closed if it is, and optionally whether there is any whitespace between the
 968 #   tags.  Support function for <RichFormatTextBlock()>.
 969 #
 970 #   The results of this function are in full context, meaning that if it says a tag is closed, it can be interpreted as that tag in the
 971 #   final output.  It takes into account any spoiling factors, like there being two opening tags in a row.
 972 #
 973 #   Parameters:
 974 #
 975 #       textBlocks             - A reference to an array of text blocks.
 976 #       index                    - The index of the opening tag.
 977 #       hasWhitespaceRef  - A reference to the variable that will hold whether there is whitespace between the tags or not.  If
 978 #                                     undef, the function will not check.  If the tag is not closed, the variable will not be changed.
 979 #
 980 #   Returns:
 981 #
 982 #       If the tag is closed, it returns the index of the closing tag and puts whether there was whitespace between the tags in
 983 #       hasWhitespaceRef if it was specified.  If the tag is not closed, it returns -1 and doesn't touch the variable pointed to by
 984 #       hasWhitespaceRef.
 985 #
 986 sub ClosingTag #(textBlocks, index, hasWhitespace)
 987     {
 988     my ($self, $textBlocks, $index, $hasWhitespaceRef) = @_;
 989
 990     my $hasWhitespace;
 991     my $closingTag;
 992
 993     if ($textBlocks->[$index] eq '*' || $textBlocks->[$index] eq '_')
 994         {  $closingTag = $textBlocks->[$index];  }
 995     elsif ($textBlocks->[$index] eq '<')
 996         {  $closingTag = '>';  }
 997     else
 998         {  return -1;  };
 999
1000     my $beginningIndex = $index;
1001     $index++;
1002
1003     while ($index < scalar @$textBlocks)
1004         {
1005         if ($textBlocks->[$index] eq '<' && $self->TagType($textBlocks, $index) == POSSIBLE_OPENING_TAG)
1006             {
1007             # If we hit a < and we're checking whether a link is closed, it's not.  The first < becomes literal and the second one
1008             # becomes the new link opening.
1009             if ($closingTag eq '>')
1010                 {
1011                 return -1;
1012                 }
1013
1014             # If we're not searching for the end of a link, we have to skip the link because formatting tags cannot appear within
1015             # them.  That's of course provided it's closed.
1016             else
1017                 {
1018                 my $linkHasWhitespace;
1019
1020                 my $endIndex = $self->ClosingTag($textBlocks, $index,
1021                                                                     ($hasWhitespaceRef && !$hasWhitespace ? \$linkHasWhitespace : undef) );
1022
1023                 if ($endIndex != -1)
1024                     {
1025                     if ($linkHasWhitespace)
1026                         {  $hasWhitespace = 1;  };
1027
1028                     # index will be incremented again at the end of the loop, which will bring us past the link's >.
1029                     $index = $endIndex;
1030                     };
1031                 };
1032             }
1033
1034         elsif ($textBlocks->[$index] eq $closingTag)
1035             {
1036             my $tagType = $self->TagType($textBlocks, $index);
1037
1038             if ($tagType == POSSIBLE_CLOSING_TAG)
1039                 {
1040                 # There needs to be something between the tags for them to count.
1041                 if ($index == $beginningIndex + 1)
1042                     {  return -1;  }
1043                 else
1044                     {
1045                     # Success!
1046
1047                     if ($hasWhitespaceRef)
1048                         {  $$hasWhitespaceRef = $hasWhitespace;  };
1049
1050                     return $index;
1051                     };
1052                 }
1053
1054             # If there are two opening tags of the same type, the first becomes literal and the next becomes part of a tag.
1055             elsif ($tagType == POSSIBLE_OPENING_TAG)
1056                 {  return -1;  }
1057             }
1058
1059         elsif ($hasWhitespaceRef && !$hasWhitespace)
1060             {
1061             if ($textBlocks->[$index] =~ /[ \t\n]/)
1062                 {  $hasWhitespace = 1;  };
1063             };
1064
1065         $index++;
1066         };
1067
1068     # Hit the end of the text blocks if we're here.
1069     return -1;
1070     };
1071
1072
1073 1;