1 ###############################################################################
3 # Package: NaturalDocs::Parser::Native
5 ###############################################################################
7 # A package that converts comments from Natural Docs' native format into <NaturalDocs::Parser::ParsedTopic> objects.
8 # Unlike most second-level packages, these are packages and not object classes.
10 ###############################################################################
12 # This file is part of Natural Docs, which is Copyright © 2003-2010 Greg Valure
13 # Natural Docs is licensed under version 3 of the GNU Affero General Public License (AGPL)
14 # Refer to License.txt for the complete details
20 package NaturalDocs::Parser::Native;
23 ###############################################################################
27 # Return values of TagType(). Not documented here.
28 use constant POSSIBLE_OPENING_TAG => 1;
29 use constant POSSIBLE_CLOSING_TAG => 2;
30 use constant NOT_A_TAG => 3;
36 # A <SymbolString> representing the package normal topics will be a part of at the current point in the file. This is a package variable
37 # because it needs to be preserved between function calls.
42 # hash: functionListIgnoredHeadings
44 # An existence hash of all the headings that prevent the parser from creating function list symbols. Whenever one of
45 # these headings is used in a function list topic, symbols are not created from definition lists until the next heading. The keys
46 # are in all lowercase.
48 my %functionListIgnoredHeadings = ( 'parameters' => 1,
58 ###############################################################################
59 # Group: Interface Functions
65 # This will be called whenever a file is about to be parsed. It allows the package to reset its internal state.
77 # Examines the comment and returns whether it is *definitely* Natural Docs content, i.e. it is owned by this package. Note
78 # that a comment can fail this function and still be interpreted as Natural Docs content, for example a JavaDoc-styled comment
79 # that has neither header lines nor JavaDoc tags.
83 # commentLines - An arrayref of the comment lines. Must have been run through <NaturalDocs::Parser->CleanComment()>.
84 # isJavaDoc - Whether the comment was JavaDoc-styled.
88 # Whether the comment is *definitely* Natural Docs content.
90 sub IsMine #(string[] commentLines, bool isJavaDoc)
92 my ($self, $commentLines, $isJavaDoc) = @_;
94 # Skip to the first line with content.
97 while ($line < scalar @$commentLines && !length $commentLines->[$line])
100 return $self->ParseHeaderLine($commentLines->[$line]);
106 # Function: ParseComment
108 # This will be called whenever a comment capable of containing Natural Docs content is found.
112 # commentLines - An arrayref of the comment lines. Must have been run through <NaturalDocs::Parser->CleanComment()>.
113 # *The original memory will be changed.*
114 # isJavaDoc - Whether the comment is JavaDoc styled.
115 # lineNumber - The line number of the first of the comment lines.
116 # parsedTopics - A reference to the array where any new <NaturalDocs::Parser::ParsedTopics> should be placed.
120 # The number of parsed topics added to the array, or zero if none.
122 sub ParseComment #(commentLines, isJavaDoc, lineNumber, parsedTopics)
124 my ($self, $commentLines, $isJavaDoc, $lineNumber, $parsedTopics) = @_;
127 my $prevLineBlank = 1;
128 my $inCodeSection = 0;
130 my ($type, $scope, $isPlural, $title, $symbol);
131 #my $package; # package variable.
132 my ($newKeyword, $newTitle);
137 my $bodyEnd = 0; # Not inclusive.
139 while ($index < scalar @$commentLines)
141 # Everything but leading whitespace was removed beforehand.
143 # If we're in a code section...
146 if ($commentLines->[$index] =~ /^ *\( *(?:end|finish|done)(?: +(?:table|code|example|diagram))? *\)$/i)
147 { $inCodeSection = undef; };
153 # If the line is empty...
154 elsif (!length($commentLines->[$index]))
162 # If the line has a recognized header and the previous line is blank...
163 elsif ($prevLineBlank && (($newKeyword, $newTitle) = $self->ParseHeaderLine($commentLines->[$index])) )
165 # Process the previous one, if any.
169 if ($scope == ::SCOPE_START() || $scope == ::SCOPE_END())
170 { $package = undef; };
172 my $body = $self->FormatBody($commentLines, $bodyStart, $bodyEnd, $type, $isPlural);
173 my $newTopic = $self->MakeParsedTopic($type, $title, $package, $body, $lineNumber + $bodyStart - 1, $isPlural);
174 push @$parsedTopics, $newTopic;
176 $package = $newTopic->Package();
182 ($type, $typeInfo, $isPlural) = NaturalDocs::Topics->KeywordInfo($newKeyword);
183 $scope = $typeInfo->Scope();
185 $bodyStart = $index + 1;
186 $bodyEnd = $index + 1;
193 # If we're on a non-empty, non-header line of a JavaDoc-styled comment and we haven't started a topic yet...
194 elsif ($isJavaDoc && !$topicCount)
197 $scope = ::SCOPE_NORMAL(); # The scope repair and topic merging processes will handle if this is a class topic.
203 $bodyEnd = $index + 1;
207 $prevLineBlank = undef;
210 # If we're on a normal content line within a topic
216 if ($commentLines->[$index] =~ /^ *\( *(?:(?:start|begin)? +)?(?:table|code|example|diagram) *\)$/i)
217 { $inCodeSection = 1; };
225 # Last one, if any. This is the only one that gets the prototypes.
228 if ($scope == ::SCOPE_START() || $scope == ::SCOPE_END())
229 { $package = undef; };
231 my $body = $self->FormatBody($commentLines, $bodyStart, $bodyEnd, $type, $isPlural);
232 my $newTopic = $self->MakeParsedTopic($type, $title, $package, $body, $lineNumber + $bodyStart - 1, $isPlural);
233 push @$parsedTopics, $newTopic;
236 $package = $newTopic->Package();
244 # Function: ParseHeaderLine
246 # If the passed line is a topic header, returns the array ( keyword, title ). Otherwise returns an empty array.
248 sub ParseHeaderLine #(line)
250 my ($self, $line) = @_;
252 if ($line =~ /^ *([a-z0-9 ]*[a-z0-9]): +(.*)$/i)
254 my ($keyword, $title) = ($1, $2);
256 # We need to do it this way because if you do "if (ND:T->KeywordInfo($keyword))" and the last element of the array it
257 # returns is false, the statement is false. That is counterintuitive, but it is how Perl evaluates a returned list as a condition.
258 my ($type, undef, undef) = NaturalDocs::Topics->KeywordInfo($keyword);
261 { return ($keyword, $title); }
271 ###############################################################################
272 # Group: Support Functions
276 # Function: MakeParsedTopic
278 # Creates a <NaturalDocs::Parser::ParsedTopic> object for the passed parameters. Scope is gotten from
279 # the package variable <package> instead of from the parameters. The summary is generated from the body.
283 # type - The <TopicType>. May be undef for headerless topics.
284 # title - The title of the topic. May be undef for headerless topics.
285 # package - The package <SymbolString> the topic appears in.
286 # body - The topic's body in <NDMarkup>.
287 # lineNumber - The topic's line number.
288 # isList - Whether the topic is a list.
292 # The <NaturalDocs::Parser::ParsedTopic> object.
294 sub MakeParsedTopic #(type, title, package, body, lineNumber, isList)
296 my ($self, $type, $title, $package, $body, $lineNumber, $isList) = @_;
301 { $summary = NaturalDocs::Parser->GetSummaryFromBody($body); };
303 return NaturalDocs::Parser::ParsedTopic->New($type, $title, $package, undef, undef, $summary,
304 $body, $lineNumber, $isList);
309 # Function: FormatBody
311 # Converts the section body to <NDMarkup>.
315 # commentLines - The arrayref of comment lines.
316 # startingIndex - The starting index of the body to format.
317 # endingIndex - The ending index of the body to format, *not* inclusive.
318 # type - The type of the section. May be undef for headerless comments.
319 # isList - Whether it's a list topic.
323 # The body formatted in <NDMarkup>.
325 sub FormatBody #(commentLines, startingIndex, endingIndex, type, isList)
327 my ($self, $commentLines, $startingIndex, $endingIndex, $type, $isList) = @_;
329 use constant TAG_NONE => 1;
330 use constant TAG_PARAGRAPH => 2;
331 use constant TAG_BULLETLIST => 3;
332 use constant TAG_DESCRIPTIONLIST => 4;
333 use constant TAG_HEADING => 5;
334 use constant TAG_PREFIXCODE => 6;
335 use constant TAG_TAGCODE => 7;
337 my %tagEnders = ( TAG_NONE() => '',
338 TAG_PARAGRAPH() => '</p>',
339 TAG_BULLETLIST() => '</li></ul>',
340 TAG_DESCRIPTIONLIST() => '</dd></dl>',
341 TAG_HEADING() => '</h>',
342 TAG_PREFIXCODE() => '</code>',
343 TAG_TAGCODE() => '</code>' );
345 my $topLevelTag = TAG_NONE;
349 my $prevLineBlank = 1;
352 my $removedCodeSpaces;
354 my $ignoreListSymbols;
356 my $index = $startingIndex;
358 while ($index < $endingIndex)
360 # If we're in a tagged code section...
361 if ($topLevelTag == TAG_TAGCODE)
363 if ($commentLines->[$index] =~ /^ *\( *(?:end|finish|done)(?: +(?:table|code|example|diagram))? *\)$/i)
365 $codeBlock =~ s/\n+$//;
366 $output .= NaturalDocs::NDMarkup->ConvertAmpChars($codeBlock) . '</code>';
368 $topLevelTag = TAG_NONE;
369 $prevLineBlank = undef;
373 $self->AddToCodeBlock($commentLines->[$index], \$codeBlock, \$removedCodeSpaces);
377 # If the line starts with a code designator...
378 elsif ($commentLines->[$index] =~ /^ *[>:|](.*)$/)
382 if ($topLevelTag == TAG_PREFIXCODE)
384 $self->AddToCodeBlock($code, \$codeBlock, \$removedCodeSpaces);
386 else # $topLevelTag != TAG_PREFIXCODE
388 if (defined $textBlock)
390 $output .= $self->RichFormatTextBlock($textBlock) . $tagEnders{$topLevelTag};
394 $topLevelTag = TAG_PREFIXCODE;
395 $output .= '<code type="anonymous">';
396 $self->AddToCodeBlock($code, \$codeBlock, \$removedCodeSpaces);
400 # If we're not in either code style...
403 # Strip any leading whitespace.
404 $commentLines->[$index] =~ s/^ +//;
406 # If we were in a prefixed code section...
407 if ($topLevelTag == TAG_PREFIXCODE)
409 $codeBlock =~ s/\n+$//;
410 $output .= NaturalDocs::NDMarkup->ConvertAmpChars($codeBlock) . '</code>';
412 $topLevelTag = TAG_NONE;
413 $prevLineBlank = undef;
417 # If the line is blank...
418 if (!length($commentLines->[$index]))
420 # End a paragraph. Everything else ignores it for now.
421 if ($topLevelTag == TAG_PARAGRAPH)
423 $output .= $self->RichFormatTextBlock($textBlock) . '</p>';
425 $topLevelTag = TAG_NONE;
431 # If the line starts with a bullet...
432 elsif ($commentLines->[$index] =~ /^[-\*o+] +([^ ].*)$/ &&
433 substr($1, 0, 2) ne '- ') # Make sure "o - Something" is a definition, not a bullet.
435 my $bulletedText = $1;
437 if (defined $textBlock)
438 { $output .= $self->RichFormatTextBlock($textBlock); };
440 if ($topLevelTag == TAG_BULLETLIST)
442 $output .= '</li><li>';
444 else #($topLevelTag != TAG_BULLETLIST)
446 $output .= $tagEnders{$topLevelTag} . '<ul><li>';
447 $topLevelTag = TAG_BULLETLIST;
450 $textBlock = $bulletedText;
452 $prevLineBlank = undef;
455 # If the line looks like a description list entry...
456 elsif ($commentLines->[$index] =~ /^(.+?) +- +([^ ].*)$/ && $topLevelTag != TAG_PARAGRAPH)
459 my $description = $2;
461 if (defined $textBlock)
462 { $output .= $self->RichFormatTextBlock($textBlock); };
464 if ($topLevelTag == TAG_DESCRIPTIONLIST)
468 else #($topLevelTag != TAG_DESCRIPTIONLIST)
470 $output .= $tagEnders{$topLevelTag} . '<dl>';
471 $topLevelTag = TAG_DESCRIPTIONLIST;
474 if (($isList && !$ignoreListSymbols) || $type eq ::TOPIC_ENUMERATION())
476 $output .= '<ds>' . NaturalDocs::NDMarkup->ConvertAmpChars($entry) . '</ds><dd>';
480 $output .= '<de>' . NaturalDocs::NDMarkup->ConvertAmpChars($entry) . '</de><dd>';
483 $textBlock = $description;
485 $prevLineBlank = undef;
488 # If the line could be a header...
489 elsif ($prevLineBlank && $commentLines->[$index] =~ /^(.*)([^ ]):$/)
491 my $headerText = $1 . $2;
493 if (defined $textBlock)
495 $output .= $self->RichFormatTextBlock($textBlock);
499 $output .= $tagEnders{$topLevelTag};
500 $topLevelTag = TAG_NONE;
502 $output .= '<h>' . $self->RichFormatTextBlock($headerText) . '</h>';
504 if ($type eq ::TOPIC_FUNCTION() && $isList)
506 $ignoreListSymbols = exists $functionListIgnoredHeadings{lc($headerText)};
509 $prevLineBlank = undef;
512 # If the line looks like a code tag...
513 elsif ($commentLines->[$index] =~ /^\( *(?:(?:start|begin)? +)?(table|code|example|diagram) *\)$/i)
515 my $codeType = lc($1);
517 if (defined $textBlock)
519 $output .= $self->RichFormatTextBlock($textBlock);
523 if ($codeType eq 'example')
524 { $codeType = 'anonymous'; }
525 elsif ($codeType eq 'table' || $codeType eq 'diagram')
526 { $codeType = 'text'; }
527 # else leave it 'code'
529 $output .= $tagEnders{$topLevelTag} . '<code type="' . $codeType . '">';
530 $topLevelTag = TAG_TAGCODE;
533 # If the line looks like an inline image...
534 elsif ($commentLines->[$index] =~ /^(\( *see +)([^\)]+?)( *\))$/i)
536 if (defined $textBlock)
538 $output .= $self->RichFormatTextBlock($textBlock);
542 $output .= $tagEnders{$topLevelTag};
543 $topLevelTag = TAG_NONE;
545 $output .= '<img mode="inline" target="' . NaturalDocs::NDMarkup->ConvertAmpChars($2) . '" '
546 . 'original="' . NaturalDocs::NDMarkup->ConvertAmpChars($1 . $2 . $3) . '">';
548 $prevLineBlank = undef;
551 # If the line isn't any of those, we consider it normal text.
554 # A blank line followed by normal text ends lists. We don't handle this when we detect if the line's blank because
555 # we don't want blank lines between list items to break the list.
556 if ($prevLineBlank && ($topLevelTag == TAG_BULLETLIST || $topLevelTag == TAG_DESCRIPTIONLIST))
558 $output .= $self->RichFormatTextBlock($textBlock) . $tagEnders{$topLevelTag} . '<p>';
560 $topLevelTag = TAG_PARAGRAPH;
564 elsif ($topLevelTag == TAG_NONE)
567 $topLevelTag = TAG_PARAGRAPH;
568 # textBlock will already be undef.
571 if (defined $textBlock)
572 { $textBlock .= ' '; };
574 $textBlock .= $commentLines->[$index];
576 $prevLineBlank = undef;
583 # Clean up anything left dangling.
584 if (defined $textBlock)
586 $output .= $self->RichFormatTextBlock($textBlock) . $tagEnders{$topLevelTag};
588 elsif (defined $codeBlock)
590 $codeBlock =~ s/\n+$//;
591 $output .= NaturalDocs::NDMarkup->ConvertAmpChars($codeBlock) . '</code>';
599 # Function: AddToCodeBlock
601 # Adds a line of text to a code block, handling all the indentation processing required.
605 # line - The line of text to add.
606 # codeBlockRef - A reference to the code block to add it to.
607 # removedSpacesRef - A reference to a variable to hold the number of spaces removed. It needs to be stored between calls.
608 # It will reset itself automatically when the code block codeBlockRef points to is undef.
610 sub AddToCodeBlock #(line, codeBlockRef, removedSpacesRef)
612 my ($self, $line, $codeBlockRef, $removedSpacesRef) = @_;
614 $line =~ /^( *)(.*)$/;
615 my ($spaces, $code) = ($1, $2);
617 if (!defined $$codeBlockRef)
621 $$codeBlockRef = $code . "\n";
622 $$removedSpacesRef = length($spaces);
624 # else ignore leading line breaks.
629 # Make sure we have the minimum amount of spaces to the left possible.
630 if (length($spaces) != $$removedSpacesRef)
632 my $spaceDifference = abs( length($spaces) - $$removedSpacesRef );
633 my $spacesToAdd = ' ' x $spaceDifference;
635 if (length($spaces) > $$removedSpacesRef)
637 $$codeBlockRef .= $spacesToAdd;
641 $$codeBlockRef =~ s/^(.)/$spacesToAdd . $1/gme;
642 $$removedSpacesRef = length($spaces);
646 $$codeBlockRef .= $code . "\n";
649 else # (!length $code)
651 $$codeBlockRef .= "\n";
657 # Function: RichFormatTextBlock
659 # Applies rich <NDMarkup> formatting to a chunk of text. This includes amp chars, formatting tags, and link tags.
663 # text - The block of text to format.
667 # The formatted text block.
669 sub RichFormatTextBlock #(text)
671 my ($self, $text) = @_;
675 # First find bare urls, e-mail addresses, and images. We have to do this before the split because they may contain underscores
676 # or asterisks. We have to mark the tags with \x1E and \x1F so they don't get confused with angle brackets from the comment.
677 # We can't convert the amp chars beforehand because we need lookbehinds in the regexps below and they need to be
678 # constant length. Sucks, huh?
681 # The previous character can't be an alphanumeric or an opening angle bracket.
684 # Optional mailto:. Ignored in output.
690 # The user portion. Alphanumeric and - _. Dots can appear between, but not at the edges or more than
692 (?: [a-z0-9\-_]+ \. )* [a-z0-9\-_]+
696 # The domain. Alphanumeric and -. Dots same as above, however, there must be at least two sections
697 # and the last one must be two to four alphanumeric characters (.com, .uk, .info, .203 for IP addresses)
698 (?: [a-z0-9\-]+ \. )+ [a-z]{2,4}
703 # The next character can't be an alphanumeric, which should prevent .abcde from matching the two to
704 # four character requirement, or a closing angle bracket.
709 {"\x1E" . 'email target="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '" '
710 . 'name="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '"' . "\x1F"}igxe;
713 # The previous character can't be an alphanumeric or an opening angle bracket.
719 # URL must start with one of the acceptable protocols.
720 (?:http|https|ftp|news|file)\:
722 # The acceptable URL characters as far as I know.
723 [a-z0-9\-\=\~\@\#\%\&\_\+\/\;\:\?\*\.\,]*
725 # The URL characters minus period and comma. If it ends on them, they're probably intended as
727 [a-z0-9\-\=\~\@\#\%\&\_\+\/\;\:\?\*]
732 # The next character must not be an acceptable character or a closing angle bracket. It must also not be a
733 # dot and then an acceptable character. These will prevent the URL from ending early just to get a match.
734 (?! \.?[a-z0-9\-\=\~\@\#\%\&\_\+\/\;\:\?\*\>] )
738 {"\x1E" . 'url target="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '" '
739 . 'name="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '"' . "\x1F"}igxe;
742 # Find image links. Inline images should already be pulled out by now.
744 $text =~ s{(\( *see +)([^\)\<\>]+?)( *\))}
745 {"\x1E" . 'img mode="link" target="' . NaturalDocs::NDMarkup->ConvertAmpChars($2) . '" '
746 . 'original="' . NaturalDocs::NDMarkup->ConvertAmpChars($1 . $2 . $3) . '"' . "\x1F"}gie;
750 # Split the text from the potential tags.
752 my @tempTextBlocks = split(/([\*_<>\x1E\x1F])/, $text);
754 # Since the symbols are considered dividers, empty strings could appear between two in a row or at the beginning/end of the
755 # array. This could seriously screw up TagType(), so we need to get rid of them.
758 while (scalar @tempTextBlocks)
760 my $tempTextBlock = shift @tempTextBlocks;
762 if (length $tempTextBlock)
763 { push @textBlocks, $tempTextBlock; };
769 my $underlineHasWhitespace;
773 while ($index < scalar @textBlocks)
775 if ($textBlocks[$index] eq "\x1E")
780 while ($textBlocks[$index] ne "\x1F")
782 $output .= $textBlocks[$index];
789 elsif ($textBlocks[$index] eq '<' && $self->TagType(\@textBlocks, $index) == POSSIBLE_OPENING_TAG)
791 my $endingIndex = $self->ClosingTag(\@textBlocks, $index, undef);
793 if ($endingIndex != -1)
798 while ($index < $endingIndex)
800 $linkText .= $textBlocks[$index];
803 # Index will be incremented again at the end of the loop.
805 $linkText = NaturalDocs::NDMarkup->ConvertAmpChars($linkText);
807 if ($linkText =~ /^(?:mailto\:)?((?:[a-z0-9\-_]+\.)*[a-z0-9\-_]+@(?:[a-z0-9\-]+\.)+[a-z]{2,4})$/i)
808 { $output .= '<email target="' . $1 . '" name="' . $1 . '">'; }
809 elsif ($linkText =~ /^(.+?) at (?:mailto\:)?((?:[a-z0-9\-_]+\.)*[a-z0-9\-_]+@(?:[a-z0-9\-]+\.)+[a-z]{2,4})$/i)
810 { $output .= '<email target="' . $2 . '" name="' . $1 . '">'; }
811 elsif ($linkText =~ /^(?:http|https|ftp|news|file)\:/i)
812 { $output .= '<url target="' . $linkText . '" name="' . $linkText . '">'; }
813 elsif ($linkText =~ /^(.+?) at ((?:http|https|ftp|news|file)\:.+)/i)
814 { $output .= '<url target="' . $2 . '" name="' . $1 . '">'; }
816 { $output .= '<link target="' . $linkText . '" name="' . $linkText . '" original="<' . $linkText . '>">'; };
819 else # it's not a link.
825 elsif ($textBlocks[$index] eq '*')
827 my $tagType = $self->TagType(\@textBlocks, $index);
829 if ($tagType == POSSIBLE_OPENING_TAG && $self->ClosingTag(\@textBlocks, $index, undef) != -1)
831 # ClosingTag() makes sure tags aren't opened multiple times in a row.
835 elsif ($bold && $tagType == POSSIBLE_CLOSING_TAG)
846 elsif ($textBlocks[$index] eq '_')
848 my $tagType = $self->TagType(\@textBlocks, $index);
850 if ($tagType == POSSIBLE_OPENING_TAG && $self->ClosingTag(\@textBlocks, $index, \$underlineHasWhitespace) != -1)
852 # ClosingTag() makes sure tags aren't opened multiple times in a row.
854 #underlineHasWhitespace is set by ClosingTag().
857 elsif ($underline && $tagType == POSSIBLE_CLOSING_TAG)
860 #underlineHasWhitespace will be reset by the next opening underline.
863 elsif ($underline && !$underlineHasWhitespace)
865 # If there's no whitespace between underline tags, all underscores are replaced by spaces so
866 # _some_underlined_text_ becomes <u>some underlined text</u>. The standard _some underlined text_
876 else # plain text or a > that isn't part of a link
878 $output .= NaturalDocs::NDMarkup->ConvertAmpChars($textBlocks[$index]);
891 # Returns whether the tag is a possible opening or closing tag, or neither. "Possible" because it doesn't check if an opening tag is
892 # closed or a closing tag is opened, just whether the surrounding characters allow it to be a candidate for a tag. For example, in
893 # "A _B" the underscore is a possible opening underline tag, but in "A_B" it is not. Support function for <RichFormatTextBlock()>.
897 # textBlocks - A reference to an array of text blocks.
898 # index - The index of the tag.
902 # POSSIBLE_OPENING_TAG, POSSIBLE_CLOSING_TAG, or NOT_A_TAG.
904 sub TagType #(textBlocks, index)
906 my ($self, $textBlocks, $index) = @_;
909 # Possible opening tags
911 if ( ( $textBlocks->[$index] =~ /^[\*_<]$/ ) &&
913 # Before it must be whitespace, the beginning of the text, or ({["'-/*_.
914 ( $index == 0 || $textBlocks->[$index-1] =~ /[\ \t\n\(\{\[\"\'\-\/\*\_]$/ ) &&
916 # Notes for 2.0: Include Spanish upside down ! and ? as well as opening quotes (66) and apostrophes (6). Look into
917 # Unicode character classes as well.
919 # After it must be non-whitespace.
920 ( $index + 1 < scalar @$textBlocks && $textBlocks->[$index+1] !~ /^[\ \t\n]/) &&
922 # Make sure we don't accept <<, <=, <-, or *= as opening tags.
923 ( $textBlocks->[$index] ne '<' || $textBlocks->[$index+1] !~ /^[<=-]/ ) &&
924 ( $textBlocks->[$index] ne '*' || $textBlocks->[$index+1] !~ /^[\=\*]/ ) &&
926 # Make sure we don't accept * or _ before it unless it's <.
927 ( $textBlocks->[$index] eq '<' || $index == 0 || $textBlocks->[$index-1] !~ /[\*\_]$/) )
929 return POSSIBLE_OPENING_TAG;
933 # Possible closing tags
935 elsif ( ( $textBlocks->[$index] =~ /^[\*_>]$/) &&
937 # After it must be whitespace, the end of the text, or )}].,!?"';:-/*_.
938 ( $index + 1 == scalar @$textBlocks || $textBlocks->[$index+1] =~ /^[ \t\n\)\]\}\.\,\!\?\"\'\;\:\-\/\*\_]/ ||
939 # Links also get plurals, like <link>s, <linx>es, <link>'s, and <links>'.
940 ( $textBlocks->[$index] eq '>' && $textBlocks->[$index+1] =~ /^(?:es|s|\')/ ) ) &&
942 # Notes for 2.0: Include closing quotes (99) and apostrophes (9). Look into Unicode character classes as well.
944 # Before it must be non-whitespace.
945 ( $index != 0 && $textBlocks->[$index-1] !~ /[ \t\n]$/ ) &&
947 # Make sure we don't accept >>, ->, or => as closing tags. >= is already taken care of.
948 ( $textBlocks->[$index] ne '>' || $textBlocks->[$index-1] !~ /[>=-]$/ ) &&
950 # Make sure we don't accept * or _ after it unless it's >.
951 ( $textBlocks->[$index] eq '>' || $textBlocks->[$index+1] !~ /[\*\_]$/) )
953 return POSSIBLE_CLOSING_TAG;
965 # Function: ClosingTag
967 # Returns whether a tag is closed or not, where it's closed if it is, and optionally whether there is any whitespace between the
968 # tags. Support function for <RichFormatTextBlock()>.
970 # The results of this function are in full context, meaning that if it says a tag is closed, it can be interpreted as that tag in the
971 # final output. It takes into account any spoiling factors, like there being two opening tags in a row.
975 # textBlocks - A reference to an array of text blocks.
976 # index - The index of the opening tag.
977 # hasWhitespaceRef - A reference to the variable that will hold whether there is whitespace between the tags or not. If
978 # undef, the function will not check. If the tag is not closed, the variable will not be changed.
982 # If the tag is closed, it returns the index of the closing tag and puts whether there was whitespace between the tags in
983 # hasWhitespaceRef if it was specified. If the tag is not closed, it returns -1 and doesn't touch the variable pointed to by
986 sub ClosingTag #(textBlocks, index, hasWhitespace)
988 my ($self, $textBlocks, $index, $hasWhitespaceRef) = @_;
993 if ($textBlocks->[$index] eq '*' || $textBlocks->[$index] eq '_')
994 { $closingTag = $textBlocks->[$index]; }
995 elsif ($textBlocks->[$index] eq '<')
996 { $closingTag = '>'; }
1000 my $beginningIndex = $index;
1003 while ($index < scalar @$textBlocks)
1005 if ($textBlocks->[$index] eq '<' && $self->TagType($textBlocks, $index) == POSSIBLE_OPENING_TAG)
1007 # If we hit a < and we're checking whether a link is closed, it's not. The first < becomes literal and the second one
1008 # becomes the new link opening.
1009 if ($closingTag eq '>')
1014 # If we're not searching for the end of a link, we have to skip the link because formatting tags cannot appear within
1015 # them. That's of course provided it's closed.
1018 my $linkHasWhitespace;
1020 my $endIndex = $self->ClosingTag($textBlocks, $index,
1021 ($hasWhitespaceRef && !$hasWhitespace ? \$linkHasWhitespace : undef) );
1023 if ($endIndex != -1)
1025 if ($linkHasWhitespace)
1026 { $hasWhitespace = 1; };
1028 # index will be incremented again at the end of the loop, which will bring us past the link's >.
1034 elsif ($textBlocks->[$index] eq $closingTag)
1036 my $tagType = $self->TagType($textBlocks, $index);
1038 if ($tagType == POSSIBLE_CLOSING_TAG)
1040 # There needs to be something between the tags for them to count.
1041 if ($index == $beginningIndex + 1)
1047 if ($hasWhitespaceRef)
1048 { $$hasWhitespaceRef = $hasWhitespace; };
1054 # If there are two opening tags of the same type, the first becomes literal and the next becomes part of a tag.
1055 elsif ($tagType == POSSIBLE_OPENING_TAG)
1059 elsif ($hasWhitespaceRef && !$hasWhitespace)
1061 if ($textBlocks->[$index] =~ /[ \t\n]/)
1062 { $hasWhitespace = 1; };
1068 # Hit the end of the text blocks if we're here.