1 # This code is a part of Slash, and is released under the GPL.
2 # Copyright 1997-2009 by Geeknet, Inc. See README
3 # and COPYING for more information, or see http://slashcode.com/.
5 package Newslash::Util::TextFormatter;
12 use Exporter 'import';
13 our @EXPORT_OK = qw(strip_by_mode);
20 use constant ANCHOR => -4;
21 use constant NOTAGS => -3;
22 use constant ATTRIBUTE => -2;
23 use constant LITERAL => -1;
24 use constant NOHTML => 0;
25 use constant PLAINTEXT => 1;
26 use constant HTML => 2;
27 use constant EXTRANS => 3;
28 use constant CODE => 4;
30 use constant APPROVED_URL_SCHEMES_REGEX => qr/(?-xism:(?:ftp|http|gopher|mailto|news|nntp|telnet|wais|https))/;
32 #========================================================================
34 =head2 breakHtml(TEXT, MAX_WORD_LENGTH)
36 Private function. Break up long words in some text. Will ignore the
37 contents of HTML tags. Called from C<stripByMode> functions -- if
38 there are any HTML tags in the text, C<stripBadHtml> will have been
39 called first. Handles spaces before dot-words so as to best work around a
40 Microsoft bug. This code largely contributed by Joe Groff <joe at pknet
55 The maximum length of a word. Default is 50 (breakhtml_wordlength in vars).
67 use constant BREAK_HTML_WORDLENGTH => 8000;
68 use constant COMMENT_STARTWORD_WORKAROUND => 0;
69 use constant TWEAK_JAPANESE => 1;
70 use constant APPROVEDTAGS_BREAK => ['p','br','ol','ul','li','dl','dt',
71 'dd','blockquote','div','img','hr',
72 'h1','h2','h3','h4','h5','h6'
74 use constant COMMENT_NONSTARTWORDCHARS_REGEX =>
75 qr{(?i-xsm:(\s+)((?:<[^>]+>)*(?:[\.\,\;\:\\/]|&#(?:46|x2e|44|x2c|59|x3b|58|x3a|47|x2f);)+)(\S))};
76 use constant APPROVEDTAGS => [ 'b','i','p','br','a','ol','ul',
77 'li','dl','dt','dd','em','strong',
78 'tt','blockquote','div','ecode','del',
79 'ins','sub','sup','quote','strike'
81 use constant APPROVEDTAGS_ATTR => {
82 'ol' => { 'start' => {'ord' => 1} },
84 'href' => { 'req' => 1,
89 'blockquote' => { 'title' => {'ord' => 2},
90 'cite' => {'ord' => 1,'url' => 1}
92 'img' => { 'width' => {'ord' => 3},
93 'alt' => {'req' => 2,'ord' => 2},
94 'longdesc' => {'ord' => 5,'url' => 1},
95 'src' => {'req' => 1,'ord' => 1,'url' => 1},
96 'height' => {'ord' => 4},
101 my ($whole_tag, $opts) = shift;
102 my $allowed_tags = $opts->{allowed_tags} || {};
103 my $r = EscapeHTML::escape_tag($allowed_tags, $whole_tag);
107 sub approve_charref {
108 my $char_ref = shift;
114 return '' if !defined $text || $text eq '';
116 my $scheme_regex = APPROVED_URL_SCHEMES_REGEX;
118 # we know this can break real URLs, but probably will
119 # preserve real URLs more often than it will break them
121 # should we parse the HTML instead? problematic ...
122 $text =~ s{(?<!\S)((?:$scheme_regex):/{0,2}[$URI::uric#]+)}{
123 #my $url = fudgeurl($1);
126 $extra = $1 if $url =~ s/([?!;:.,']+)$//;
127 $extra = ')' . $extra if $url !~ /\(/ && $url =~ s/\)$//;
128 #print STDERR "url2html s/// url='$url' extra='$extra'\n" if !defined($url) || !defined($extra);
129 qq[<a href="$url" rel="url2html-$$">$url</a>$extra];
131 # url2html-$$ is so we can remove the whole thing later for ecode
137 my($text, $mwl) = @_;
138 $mwl = $mwl || BREAK_HTML_WORDLENGTH;
140 # Only do the <NOBR> and <WBR> bug workaround if wanted.
141 my $workaround_start = COMMENT_STARTWORD_WORKAROUND
143 my $workaround_end = COMMENT_STARTWORD_WORKAROUND
144 ? "<wbr></nobr> " : " ";
146 if (TWEAK_JAPANESE) {
147 $workaround_start = "";
148 $workaround_end = "";
151 # These are tags that "break" a word;
152 # a<P>b</P> breaks words, y<B>z</B> does not
153 my $approvedtags_break = APPROVEDTAGS_BREAK || [];
154 my $break_tag = join '|', @$approvedtags_break;
155 $break_tag = qr{(?:$break_tag)}i;
157 # This is the regex that finds a char that, at the start of
158 # a word, will trigger Microsoft's bug. It's already been
159 # set up for us, it just needs a shorter name.
160 my $nswcr = COMMENT_NONSTARTWORDCHARS_REGEX;
162 # And we also need a regex that will find an HTML entity or
163 # character references, excluding ones that would break words:
164 # a non-breaking entity. For now, let's assume *all* entities
165 # are non-breaking (except an encoded space which would be
174 # Mark off breaking tags, as we don't want them counted as
180 # Temporarily hide whitespace inside tags so that the regex below
181 # won't accidentally catch attributes, e.g. the HREF= of an A tag.
182 # (Which I don't think it can do anyway, based on the way the
183 # following regex gobbles <> and the fact that tags should already
184 # be balanced by this point...but this can't hurt - Jamie)
186 (<[^>\s]*) # Seek in a tab up to its
187 \s+ # first whitespace
188 }{$1\x00}gsx; # and replace the space with NUL
190 # Put the <wbr> in front of attempts to exploit MSIE's
191 # half-braindead adherance to Unicode char breaking.
192 $text =~ s{$nswcr}{<nobr> <wbr></nobr>$2$3}gs if COMMENT_STARTWORD_WORKAROUND;
194 # Break up overlong words, treating entities/character references
195 # as single characters and ignoring HTML tags.
197 (?:^|\G|\s) # Must start at a word bound
199 (?>(?:<[^>]+>)*) # Eat up HTML tags
200 ( # followed by either
201 $nbe # an entity (char. ref.)
202 | (?!$nbe)\S # or an ordinary char
204 ){$mwl} # $mwl non-HTML-tag chars in a row
206 substr($1, 0, -length($2))
208 . substr($1, -length($2))
212 # Just to be tidy, if we appended that word break at the very end
213 # of the text, eliminate it.
214 $text =~ s{<nobr> <wbr></nobr>\s*$}{} if COMMENT_STARTWORD_WORKAROUND;
223 # Change other NULs back to whitespace.
224 $text =~ s{\x00}{ }g;
230 #========================================================================
232 =head2 processCustomTagsPre(STRING)
234 =head2 processCustomTagsPost(STRING)
236 Private function. It does processing of special custom tags (in Pre, ECODE;
247 String to be processed.
257 Pre is meant to be used before C<approveTag> is called; Post after.
258 Both are called only from regular posting modes, HTML and PLAINTEXT.
264 sub process_custom_tags_pre {
267 ## Deal with special ECODE tag (Embedded Code). This tag allows
268 ## embedding the Code postmode in plain or HTML modes. It may be
270 ## <ECODE>literal text</ECODE>
271 ## or, for the case where "</ECODE>" is needed in the text:
272 ## <ECODE END="SOMETAG">literal text</SOMETAG>
274 ## SOMETAG must match /^\w+$/.
277 ## Note that we also strip out leading and trailing newlines
278 ## surrounding the tags, because in plain text mode this can
279 ## be hard to manage, so we manage it for the user.
281 ## Also note that this won't work if the site disallows TT
282 ## or BLOCKQUOTE tags.
286 # ECODE must be in approvedtags
287 if (grep /^ecode$/i, @{APPROVEDTAGS()}) {
288 $str =~ s|<(/?)literal>|<${1}ecode>|gi; # we used to accept "literal" too
290 my $open = qr[\n* <\s* (?:$ecode) (?: \s+ END="(\w+)")? \s*> \n*]xsio;
291 my $close_1 = qr[($open (.*?) \n* <\s* /\2 \s*> \n*)]xsio; # if END is used
292 my $close_2 = qr[($open (.*?) \n* <\s* /ECODE \s*> \n*)]xsio; # if END is not used
294 while ($str =~ m[($open)]g) {
295 my $len = length($1);
297 my $pos = pos($str) - $len;
299 my $close = $end ? $close_1 : $close_2;
300 my $substr = substr($str, $pos);
301 if ($substr =~ m/^$close/si) {
302 my $len = length($1);
304 # remove these if they were added by url2html; I know
305 # this is a rather cheesy way to do this, but c'est la vie
307 $codestr =~ s{<a href="[^"]+" rel="url2html-$$">(.+?)</a>}{$1}g;
308 my $code = strip_code($codestr);
309 my $newstr = "<p><blockquote>$code</blockquote></p>";
310 substr($str, $pos, $len) = $newstr;
311 pos($str) = $pos + length($newstr);
318 sub process_custom_tags_post {
321 # QUOTE must be in approvedtags
322 if (grep /^quote$/i, @{APPROVEDTAGS()}) {
324 my $open = qr[\n* <\s* $quote \s*> \n*]xsio;
325 my $close = qr[\n* <\s* /$quote \s*> \n*]xsio;
327 $str =~ s/$open/<p><div class="quote">/g;
328 $str =~ s/$close/<\/div><\/p>/g;
331 # just fix the whitespace for blockquote to something that looks
333 if (grep /^blockquote$/i, @{APPROVEDTAGS()}) {
334 my $quote = 'blockquote';
335 my $open = qr[\s* <\s* $quote \s*> \n*]xsio;
336 my $close = qr[\s* <\s* /$quote \s*> \n*]xsio;
338 $str =~ s/(?<!<p>)$open/<p><$quote>/g;
346 my %action_data = ();
349 newline_to_local => sub {
350 ${$_[0]} =~ s/(?:\015?\012|\015)/\n/g; },
351 trailing_whitespace => sub {
352 ${$_[0]} =~ s/[\t ]+\n/\n/g; },
353 encode_html_amp => sub {
354 ${$_[0]} =~ s/&/&/g; },
355 encode_html_amp_ifnotent => sub {
356 ${$_[0]} =~ s/&(?!#?[a-zA-Z0-9]+;)/&/g; },
357 encode_html_ltgt => sub {
358 ${$_[0]} =~ s/</</g;
359 ${$_[0]} =~ s/>/>/g; },
360 encode_html_ltgt_stray => sub {
361 1 while ${$_[0]} =~ s{
362 ( (?: ^ | > ) [^<]* )
365 1 while ${$_[0]} =~ s{
367 ( [^>]* (?: < | $ ) )
370 encode_html_quote => sub {
371 ${$_[0]} =~ s/"/"/g; },
372 breakHtml_ifwhitefix => sub {
373 ${$_[0]} = break_html(${$_[0]})
374 unless $action_data{no_white_fix}; },
375 processCustomTagsPre => sub {
376 ${$_[0]} = process_custom_tags_pre(${$_[0]}); },
377 processCustomTagsPost => sub {
378 ${$_[0]} = process_custom_tags_post(${$_[0]}); },
380 ${$_[0]} =~ s/<(.*?)>/approve_tag($1, $_[2])/sge; },
382 ${$_[0]} = url_to_html(${$_[0]}); },
383 approveCharrefs => sub {
386 }{approve_charref($1)}gex; },
387 space_between_tags => sub {
388 ${$_[0]} =~ s/></> </g; },
389 whitespace_tagify => sub {
390 ${$_[0]} =~ s/\n/<br>/gi; # pp breaks
391 ${$_[0]} =~ s/(?:<br>\s*){2,}<br>/<br><br>/gi;
392 # Preserve leading indents / spaces
393 # can mess up internal tabs, oh well
394 ${$_[0]} =~ s/\t/ /g; },
395 paragraph_wrap => sub {
396 # start off the text with a <p>!
397 ${$_[0]} = '<p>' . ${$_[0]} unless ${$_[0]} =~ /^\s*<p>/s;
398 # this doesn't assume there will be only two BRs,
399 # but it does come after whitespace_tagify, so
400 # chances are, will be only two BRs in a row
401 ${$_[0]} =~ s/(?:<br>){2}/<p>/g;
402 # make sure we don't end with a <br><p> or <br>
403 ${$_[0]} =~ s/<br>(<p>|$)/$1/g; },
404 whitespace_and_tt => sub {
405 ${$_[0]} =~ s{((?: )+)(?: (\S))?} {
406 (" " x (length($1)/2)) .
407 (defined($2) ? " $2" : "")
409 ${$_[0]} = "<tt>${$_[0]}</tt>"; },
410 newline_indent => sub {
411 ${$_[0]} =~ s{<br>\n?( +)} {
412 "<br>\n" . (' ' x length($1))
415 ${$_[0]} =~ s/<.*?>//gs; },
418 ${$_[0]} =~ s/>//g; },
419 remove_trailing_lts => sub {
420 ${$_[0]} =~ s/<(?!.*?>)//gs; },
421 remove_newlines => sub {
422 ${$_[0]} =~ s/\n+//g; },
433 processCustomTagsPost
435 encode_html_ltgt_stray
436 encode_html_amp_ifnotent
448 processCustomTagsPost
450 encode_html_ltgt_stray
451 encode_html_amp_ifnotent
453 breakHtml_ifwhitefix )],
461 breakHtml_ifwhitefix )],
473 my ($str, $fmode, @rest) = @_;
475 return "" if !defined $str;
476 return if !defined $fmode;
478 $action_data{no_white_fix} = $opts->{no_white_fix} || 0;
480 my @actions = @{$mode_actions{$fmode}};
481 for my $action (@actions) {
482 $actions{$action}->(\$str, $fmode, $opts);
488 my($string, $mode, @args) = @_;
489 return "" if !$mode || $mode < 1 || $mode > 4; # user-supplied modes are 1-4
490 return strip_by_mode($string, $mode, @args);
494 return strip_by_mode(shift, CODE);
497 sub contextual_strip {
498 my ($text, $max_chars, $min_chars) = @_;
500 return if !$max_chars;
504 my @tmp = split(/(。)/, $text);
511 push @sentences, $prev if $prev;
515 push @sentences, $prev;
518 if (@sentences == 0) {
521 elsif (@sentences == 1) {
522 $result = $sentences[0];
525 for my $sentence (@sentences) {
526 my $tmp = $result . $sentence;
527 if (length($tmp) > $max_chars) {
528 last if (length($tmp) > $min_chars);
535 if (length($result) > $max_chars) {
536 $result = substr($result, 0, $max_chars);
537 $result = $result . " ...";
545 =head2 tidy_html($html)
547 tidy HTML, then returns result.
553 my $tidy = HTML::Tidy->new( {
556 'show-body-only' => 1,
558 my $tidyed = $tidy->clean($html);
563 =head2 escape_html(\%allowed, $html)
565 escape HTML, then returns result.
570 my ($allowed, $html) = @_;
571 return EscapeHTML::escape($allowed, $html);
574 =head2 clean_html(\%allowed, $html)
576 escape and tidy HTML, then returns result.
581 my ($allowed, $html) = @_;
582 my $escaped = escape_html($allowed, $html);
583 $escaped = EscapeHTML::blank_line_to_paragraph($escaped);
585 my $tidy = HTML::Tidy->new( {
588 'show-body-only' => 1,
590 my $tidyed = $tidy->clean($escaped);
594 =head2 escape_plaintext($text)
596 escape plaintext, then returns result.
600 sub escape_plaintext {
603 $text =~ s/(?!|lt;|gt;)/&/g;