src/newslash_web/lib/Newslash/Util/TextFormatter.pm

   1 # This code is a part of Slash, and is released under the GPL.
   2 # Copyright 1997-2009 by Geeknet, Inc. See README
   3 # and COPYING for more information, or see http://slashcode.com/.
   4
   5 package Newslash::Util::TextFormatter;
   6
   7 use strict;
   8 use warnings;
   9 use utf8;
  10 use feature ':5.10';
  11
  12 use Exporter 'import';
  13 our @EXPORT_OK = qw(strip_by_mode);
  14
  15 use EscapeHTML qw();
  16 use HTML::Tidy;
  17 use Data::Dumper;
  18 use Encode;
  19
  20 use constant ANCHOR    => -4;
  21 use constant NOTAGS    => -3;
  22 use constant ATTRIBUTE => -2;
  23 use constant LITERAL   => -1;
  24 use constant NOHTML    => 0;
  25 use constant PLAINTEXT => 1;
  26 use constant HTML      => 2;
  27 use constant EXTRANS   => 3;
  28 use constant CODE      => 4;
  29
  30 use constant APPROVED_URL_SCHEMES_REGEX => qr/(?-xism:(?:ftp|http|gopher|mailto|news|nntp|telnet|wais|https))/;
  31
  32 #========================================================================
  33
  34 =head2 breakHtml(TEXT, MAX_WORD_LENGTH)
  35
  36 Private function.  Break up long words in some text.  Will ignore the
  37 contents of HTML tags.  Called from C<stripByMode> functions -- if
  38 there are any HTML tags in the text, C<stripBadHtml> will have been
  39 called first.  Handles spaces before dot-words so as to best work around a
  40 Microsoft bug.  This code largely contributed by Joe Groff <joe at pknet
  41 dot com>.
  42
  43 =over 4
  44
  45 =item Parameters
  46
  47 =over 4
  48
  49 =item TEXT
  50
  51 The text to be fixed.
  52
  53 =item MAX_WORD_LENGTH
  54
  55 The maximum length of a word.  Default is 50 (breakhtml_wordlength in vars).
  56
  57 =back
  58
  59 =item Return value
  60
  61 The text.
  62
  63 =back
  64
  65 =cut
  66
  67 use constant BREAK_HTML_WORDLENGTH => 8000;
  68 use constant COMMENT_STARTWORD_WORKAROUND => 0;
  69 use constant TWEAK_JAPANESE => 1;
  70 use constant APPROVEDTAGS_BREAK => ['p','br','ol','ul','li','dl','dt',
  71                                     'dd','blockquote','div','img','hr',
  72                                     'h1','h2','h3','h4','h5','h6'
  73                                    ];
  74 use constant COMMENT_NONSTARTWORDCHARS_REGEX =>
  75   qr{(?i-xsm:(\s+)((?:<[^>]+>)*(?:[\.\,\;\:\\/]|&#(?:46|x2e|44|x2c|59|x3b|58|x3a|47|x2f);)+)(\S))};
  76 use constant APPROVEDTAGS => [ 'b','i','p','br','a','ol','ul',
  77                                'li','dl','dt','dd','em','strong',
  78                                'tt','blockquote','div','ecode','del',
  79                                'ins','sub','sup','quote','strike'
  80                              ];
  81 use constant APPROVEDTAGS_ATTR => {
  82                                   'ol' => { 'start' => {'ord' => 1} },
  83                                   'a' => {
  84                                           'href' => { 'req' => 1,
  85                                                       'ord' => 1,
  86                                                       'url' => 1,
  87                                                     }
  88                                          },
  89                                   'blockquote' => { 'title' => {'ord' => 2},
  90                                                     'cite' => {'ord' => 1,'url' => 1}
  91                                                   },
  92                                   'img' => { 'width' => {'ord' => 3},
  93                                              'alt' => {'req' => 2,'ord' => 2},
  94                                              'longdesc' => {'ord' => 5,'url' => 1},
  95                                              'src' => {'req' => 1,'ord' => 1,'url' => 1},
  96                                              'height' => {'ord' => 4},
  97                                            },
  98                                  };
  99
 100 sub approve_tag {
 101     my ($whole_tag, $opts) = shift;
 102     my $allowed_tags = $opts->{allowed_tags} || {};
 103     my $r = EscapeHTML::escape_tag($allowed_tags, $whole_tag);
 104     return "<$r>";
 105 };
 106
 107 sub approve_charref {
 108     my $char_ref = shift;
 109     return $char_ref;
 110 }
 111
 112 sub url_to_html {
 113     my $text = shift;
 114     return '' if !defined $text || $text eq '';
 115
 116     my $scheme_regex = APPROVED_URL_SCHEMES_REGEX;
 117
 118     # we know this can break real URLs, but probably will
 119     # preserve real URLs more often than it will break them
 120     # was ['":=>]
 121     # should we parse the HTML instead?  problematic ...
 122     $text =~  s{(?<!\S)((?:$scheme_regex):/{0,2}[$URI::uric#]+)}{
 123         #my $url   = fudgeurl($1);
 124         my $url = $1;
 125         my $extra = '';
 126         $extra = $1 if $url =~ s/([?!;:.,']+)$//;
 127         $extra = ')' . $extra if $url !~ /\(/ && $url =~ s/\)$//;
 128         #print STDERR "url2html s/// url='$url' extra='$extra'\n" if !defined($url) || !defined($extra);
 129         qq[<a href="$url" rel="url2html-$$">$url</a>$extra];
 130     }ogie;
 131     # url2html-$$ is so we can remove the whole thing later for ecode
 132
 133     return $text;
 134 }
 135
 136 sub break_html {
 137     my($text, $mwl) = @_;
 138     $mwl = $mwl || BREAK_HTML_WORDLENGTH;
 139
 140     # Only do the <NOBR> and <WBR> bug workaround if wanted.
 141     my $workaround_start = COMMENT_STARTWORD_WORKAROUND
 142       ? "<nobr>" : "";
 143     my $workaround_end = COMMENT_STARTWORD_WORKAROUND
 144       ? "<wbr></nobr> " : " ";
 145
 146     if (TWEAK_JAPANESE) {
 147         $workaround_start = "";
 148         $workaround_end = "";
 149     }
 150
 151     # These are tags that "break" a word;
 152     # a<P>b</P> breaks words, y<B>z</B> does not
 153     my $approvedtags_break = APPROVEDTAGS_BREAK || [];
 154     my $break_tag = join '|', @$approvedtags_break;
 155     $break_tag = qr{(?:$break_tag)}i;
 156
 157     # This is the regex that finds a char that, at the start of
 158     # a word, will trigger Microsoft's bug.  It's already been
 159     # set up for us, it just needs a shorter name.
 160     my $nswcr = COMMENT_NONSTARTWORDCHARS_REGEX;
 161
 162     # And we also need a regex that will find an HTML entity or
 163     # character references, excluding ones that would break words:
 164     # a non-breaking entity.  For now, let's assume *all* entities
 165     # are non-breaking (except an encoded space which would be
 166     # kinda dumb).
 167     my $nbe = qr{ (?:
 168                       &
 169                       (?! \# (?:32|x20) )
 170                       (\#?[a-zA-Z0-9]+)
 171                       ;
 172                   ) }xi;
 173
 174     # Mark off breaking tags, as we don't want them counted as
 175     # part of long words
 176     $text =~ s{
 177                   (</?$break_tag>)
 178           }{\x00$1\x00}gsx;
 179
 180     # Temporarily hide whitespace inside tags so that the regex below
 181     # won't accidentally catch attributes, e.g. the HREF= of an A tag.
 182     # (Which I don't think it can do anyway, based on the way the
 183     # following regex gobbles <> and the fact that tags should already
 184     # be balanced by this point...but this can't hurt - Jamie)
 185     1 while $text =~ s{
 186                           (<[^>\s]*)    # Seek in a tab up to its
 187                           \s+           # first whitespace
 188                   }{$1\x00}gsx;         # and replace the space with NUL
 189
 190     # Put the <wbr> in front of attempts to exploit MSIE's
 191     # half-braindead adherance to Unicode char breaking.
 192     $text =~ s{$nswcr}{<nobr> <wbr></nobr>$2$3}gs if COMMENT_STARTWORD_WORKAROUND;
 193
 194     # Break up overlong words, treating entities/character references
 195     # as single characters and ignoring HTML tags.
 196     $text =~ s{(
 197                    (?:^|\G|\s)          # Must start at a word bound
 198                    (?:
 199                        (?>(?:<[^>]+>)*) # Eat up HTML tags
 200                        (                        # followed by either
 201                            $nbe         # an entity (char. ref.)
 202                        |        (?!$nbe)\S      # or an ordinary char
 203                        )
 204                    ){$mwl}                      # $mwl non-HTML-tag chars in a row
 205                )}{
 206                    substr($1, 0, -length($2))
 207                      . $workaround_start
 208                      . substr($1, -length($2))
 209                      . $workaround_end
 210                  }gsex;
 211
 212     # Just to be tidy, if we appended that word break at the very end
 213     # of the text, eliminate it.
 214     $text =~ s{<nobr> <wbr></nobr>\s*$}{} if COMMENT_STARTWORD_WORKAROUND;
 215
 216     # Fix breaking tags
 217     $text =~ s{
 218                   \x00
 219                   (</?$break_tag>)
 220                   \x00
 221           }{$1}gsx;
 222
 223     # Change other NULs back to whitespace.
 224     $text =~ s{\x00}{ }g;
 225
 226     return $text;
 227 }
 228
 229
 230 #========================================================================
 231
 232 =head2 processCustomTagsPre(STRING)
 233
 234 =head2 processCustomTagsPost(STRING)
 235
 236 Private function.  It does processing of special custom tags (in Pre, ECODE;
 237 in Post, QUOTE).
 238
 239 =over 4
 240
 241 =item Parameters
 242
 243 =over 4
 244
 245 =item STRING
 246
 247 String to be processed.
 248
 249 =back
 250
 251 =item Return value
 252
 253 Processed string.
 254
 255 =item Dependencies
 256
 257 Pre is meant to be used before C<approveTag> is called; Post after.
 258 Both are called only from regular posting modes, HTML and PLAINTEXT.
 259
 260 =back
 261
 262 =cut
 263
 264 sub process_custom_tags_pre {
 265     my ($str) = @_;
 266
 267         ## Deal with special ECODE tag (Embedded Code).  This tag allows
 268         ## embedding the Code postmode in plain or HTML modes.  It may be
 269         ## of the form:
 270         ##    <ECODE>literal text</ECODE>
 271         ## or, for the case where "</ECODE>" is needed in the text:
 272         ##    <ECODE END="SOMETAG">literal text</SOMETAG>
 273         ##
 274         ## SOMETAG must match /^\w+$/.
 275         ##
 276         ##
 277         ## Note that we also strip out leading and trailing newlines
 278         ## surrounding the tags, because in plain text mode this can
 279         ## be hard to manage, so we manage it for the user.
 280         ##
 281         ## Also note that this won't work if the site disallows TT
 282         ## or BLOCKQUOTE tags.
 283         ##
 284         ## -- pudge
 285
 286     # ECODE must be in approvedtags
 287         if (grep /^ecode$/i, @{APPROVEDTAGS()}) {
 288                 $str =~ s|<(/?)literal>|<${1}ecode>|gi;  # we used to accept "literal" too
 289                 my $ecode   = 'ecode';
 290                 my $open    = qr[\n* <\s* (?:$ecode) (?: \s+ END="(\w+)")? \s*> \n*]xsio;
 291                 my $close_1 = qr[($open (.*?) \n* <\s* /\2    \s*> \n*)]xsio;  # if END is used
 292                 my $close_2 = qr[($open (.*?) \n* <\s* /ECODE \s*> \n*)]xsio;  # if END is not used
 293
 294                 while ($str =~ m[($open)]g) {
 295                         my $len = length($1);
 296                         my $end = $2;
 297                         my $pos = pos($str) - $len;
 298
 299                         my $close = $end ? $close_1 : $close_2;
 300                         my $substr = substr($str, $pos);
 301                         if ($substr =~ m/^$close/si) {
 302                                 my $len = length($1);
 303                                 my $codestr = $3;
 304                                 # remove these if they were added by url2html; I know
 305                                 # this is a rather cheesy way to do this, but c'est la vie
 306                                 # -- pudge
 307                                 $codestr =~ s{<a href="[^"]+" rel="url2html-$$">(.+?)</a>}{$1}g;
 308                                 my $code = strip_code($codestr);
 309                                 my $newstr = "<p><blockquote>$code</blockquote></p>";
 310                                 substr($str, $pos, $len) = $newstr;
 311                                 pos($str) = $pos + length($newstr);
 312                         }
 313                 }
 314         }
 315         return $str;
 316 }
 317
 318 sub process_custom_tags_post {
 319         my($str) = @_;
 320
 321         # QUOTE must be in approvedtags
 322         if (grep /^quote$/i, @{APPROVEDTAGS()}) {
 323                 my $quote   = 'quote';
 324                 my $open    = qr[\n* <\s*  $quote \s*> \n*]xsio;
 325                 my $close   = qr[\n* <\s* /$quote \s*> \n*]xsio;
 326
 327                 $str =~ s/$open/<p><div class="quote">/g;
 328                 $str =~ s/$close/<\/div><\/p>/g;
 329         }
 330
 331         # just fix the whitespace for blockquote to something that looks
 332         # universally good
 333         if (grep /^blockquote$/i, @{APPROVEDTAGS()}) {
 334                 my $quote   = 'blockquote';
 335                 my $open    = qr[\s* <\s*  $quote \s*> \n*]xsio;
 336                 my $close   = qr[\s* <\s* /$quote \s*> \n*]xsio;
 337
 338                 $str =~ s/(?<!<p>)$open/<p><$quote>/g;
 339         }
 340
 341         return $str;
 342 }
 343
 344
 345
 346 my %action_data = ();
 347
 348 my %actions = (
 349         newline_to_local => sub {
 350                         ${$_[0]} =~ s/(?:\015?\012|\015)/\n/g;          },
 351         trailing_whitespace => sub {
 352                         ${$_[0]} =~ s/[\t ]+\n/\n/g;                    },
 353         encode_html_amp => sub {
 354                         ${$_[0]} =~ s/&/&amp;/g;                        },
 355         encode_html_amp_ifnotent => sub {
 356                         ${$_[0]} =~ s/&(?!#?[a-zA-Z0-9]+;)/&amp;/g;     },
 357         encode_html_ltgt => sub {
 358                         ${$_[0]} =~ s/</&lt;/g;
 359                         ${$_[0]} =~ s/>/&gt;/g;                         },
 360         encode_html_ltgt_stray => sub {
 361                         1 while ${$_[0]} =~ s{
 362                                 ( (?: ^ | > ) [^<]* )
 363                                 >
 364                         }{$1&gt;}gx;
 365                         1 while ${$_[0]} =~ s{
 366                                 <
 367                                 ( [^>]* (?: < | $ ) )
 368                                 >
 369                         }{&lt;$1}gx;                                    },
 370         encode_html_quote => sub {
 371                         ${$_[0]} =~ s/"/&#34;/g;                        },
 372         breakHtml_ifwhitefix => sub {
 373                         ${$_[0]} = break_html(${$_[0]})
 374                                 unless $action_data{no_white_fix};      },
 375         processCustomTagsPre => sub {
 376                         ${$_[0]} = process_custom_tags_pre(${$_[0]});   },
 377         processCustomTagsPost => sub {
 378                         ${$_[0]} = process_custom_tags_post(${$_[0]});  },
 379         approveTags => sub {
 380                         ${$_[0]} =~ s/<(.*?)>/approve_tag($1, $_[2])/sge;       },
 381         url2html => sub {
 382                         ${$_[0]} = url_to_html(${$_[0]});                       },
 383         approveCharrefs => sub {
 384                         ${$_[0]} =~ s{
 385                                 &(\#?[a-zA-Z0-9]+);?
 386                         }{approve_charref($1)}gex;                      },
 387         space_between_tags => sub {
 388                         ${$_[0]} =~ s/></> </g;                         },
 389         whitespace_tagify => sub {
 390                         ${$_[0]} =~ s/\n/<br>/gi;  # pp breaks
 391                         ${$_[0]} =~ s/(?:<br>\s*){2,}<br>/<br><br>/gi;
 392                         # Preserve leading indents / spaces
 393                         # can mess up internal tabs, oh well
 394                         ${$_[0]} =~ s/\t/    /g;                        },
 395         paragraph_wrap => sub {
 396                         # start off the text with a <p>!
 397                         ${$_[0]} = '<p>' . ${$_[0]} unless ${$_[0]} =~ /^\s*<p>/s;
 398                         # this doesn't assume there will be only two BRs,
 399                         # but it does come after whitespace_tagify, so
 400                         # chances are, will be only two BRs in a row
 401                         ${$_[0]} =~ s/(?:<br>){2}/<p>/g;
 402                         # make sure we don't end with a <br><p> or <br>
 403                         ${$_[0]} =~ s/<br>(<p>|$)/$1/g;                 },
 404         whitespace_and_tt => sub {
 405                         ${$_[0]} =~ s{((?:  )+)(?: (\S))?} {
 406                                 ("&nbsp; " x (length($1)/2)) .
 407                                 (defined($2) ? "&nbsp;$2" : "")
 408                         }eg;
 409                         ${$_[0]} = "<tt>${$_[0]}</tt>";                 },
 410         newline_indent => sub {
 411                         ${$_[0]} =~ s{<br>\n?( +)} {
 412                                 "<br>\n" . ('&nbsp; ' x length($1))
 413                         }ieg;                                           },
 414         remove_tags => sub {
 415                         ${$_[0]} =~ s/<.*?>//gs;                        },
 416         remove_ltgt => sub {
 417                         ${$_[0]} =~ s/<//g;
 418                         ${$_[0]} =~ s/>//g;                             },
 419         remove_trailing_lts => sub {
 420                         ${$_[0]} =~ s/<(?!.*?>)//gs;                    },
 421         remove_newlines => sub {
 422                         ${$_[0]} =~ s/\n+//g;                           },
 423
 424 );
 425
 426 my %mode_actions = (
 427         PLAINTEXT, [qw(
 428                         newline_to_local
 429                         trailing_whitespace
 430                         processCustomTagsPre
 431                         remove_trailing_lts
 432                         approveTags
 433                         processCustomTagsPost
 434                         space_between_tags
 435                         encode_html_ltgt_stray
 436                         encode_html_amp_ifnotent
 437                         approveCharrefs
 438                         breakHtml_ifwhitefix
 439                         whitespace_tagify
 440                         newline_indent
 441                         paragraph_wrap                  )],
 442         HTML, [qw(
 443                         newline_to_local
 444                         trailing_whitespace
 445                         processCustomTagsPre
 446                         remove_trailing_lts
 447                         approveTags
 448                         processCustomTagsPost
 449                         space_between_tags
 450                         encode_html_ltgt_stray
 451                         encode_html_amp_ifnotent
 452                         approveCharrefs
 453                         breakHtml_ifwhitefix            )],
 454         CODE, [qw(
 455                         newline_to_local
 456                         trailing_whitespace
 457                         encode_html_amp
 458                         encode_html_ltgt
 459                         whitespace_tagify
 460                         whitespace_and_tt
 461                         breakHtml_ifwhitefix            )],
 462         EXTRANS, [qw(
 463                         newline_to_local
 464                         trailing_whitespace
 465                         encode_html_amp
 466                         encode_html_ltgt
 467                         breakHtml_ifwhitefix
 468                         whitespace_tagify
 469                         newline_indent                  )],
 470 );
 471
 472 sub strip_by_mode {
 473     my ($str, $fmode, @rest) = @_;
 474     my $opts = {@rest};
 475     return "" if !defined $str;
 476     return if !defined $fmode;
 477
 478     $action_data{no_white_fix} = $opts->{no_white_fix} || 0;
 479
 480     my @actions = @{$mode_actions{$fmode}};
 481     for my $action (@actions) {
 482         $actions{$action}->(\$str, $fmode, $opts);
 483     }
 484     return $str;
 485 }
 486
 487 sub strip_mode {
 488         my($string, $mode, @args) = @_;
 489         return "" if !$mode || $mode < 1 || $mode > 4;  # user-supplied modes are 1-4
 490         return strip_by_mode($string, $mode, @args);
 491 }
 492
 493 sub strip_code {
 494     return strip_by_mode(shift, CODE);
 495 }
 496
 497 sub contextual_strip {
 498     my ($text, $max_chars, $min_chars) = @_;
 499     return if !$text;
 500     return if !$max_chars;
 501     $min_chars ||= 0;
 502
 503     # split by "。"
 504     my @tmp = split(/(。)/, $text);
 505     my @sentences;
 506     my $prev;
 507     for my $s (@tmp) {
 508         if ($s eq "。") {
 509             $prev = $prev . $s;
 510         } else {
 511             push @sentences, $prev if $prev;
 512             $prev = $s;
 513         }
 514     }
 515     push @sentences, $prev;
 516
 517     my $result = "";
 518     if (@sentences == 0) {
 519         $result = $text;
 520     }
 521     elsif (@sentences == 1) {
 522         $result = $sentences[0];
 523     }
 524     else {
 525         for my $sentence (@sentences) {
 526             my $tmp = $result . $sentence;
 527             if (length($tmp) > $max_chars) {
 528                 last if (length($tmp) > $min_chars);
 529                 $result = $tmp;
 530                 last;
 531             }
 532             $result = $tmp;
 533         }
 534     }
 535     if (length($result) > $max_chars) {
 536         $result = substr($result, 0, $max_chars);
 537         $result = $result . " ...";
 538     }
 539
 540     return $result;
 541 }
 542
 543 =head1 FUNCTIONS
 544
 545 =head2 tidy_html($html)
 546
 547   tidy HTML, then returns result.
 548
 549 =cut
 550
 551 sub tidy_html {
 552     my ($html) = @_;
 553     my $tidy = HTML::Tidy->new( {
 554                                  tidy_mark => 0,
 555                                  doctype => 'omit',
 556                                  'show-body-only' => 1,
 557                      } );
 558     my $tidyed = $tidy->clean($html);
 559     chomp($tidyed);
 560     return $tidyed;
 561 }
 562
 563 =head2 escape_html(\%allowed, $html)
 564
 565   escape HTML, then returns result.
 566
 567 =cut
 568
 569 sub escape_html {
 570     my ($allowed, $html) = @_;
 571     return EscapeHTML::escape($allowed, $html);
 572 }
 573
 574 =head2 clean_html(\%allowed, $html)
 575
 576   escape and tidy HTML, then returns result.
 577
 578 =cut
 579
 580 sub clean_html {
 581     my ($allowed, $html) = @_;
 582     my $escaped = escape_html($allowed, $html);
 583     $escaped = EscapeHTML::blank_line_to_paragraph($escaped);
 584
 585     my $tidy = HTML::Tidy->new( {
 586                                  tidy_mark => 0,
 587                                  doctype => 'omit',
 588                                  'show-body-only' => 1,
 589                      } );
 590     my $tidyed = $tidy->clean($escaped);
 591     return $tidyed;
 592 }
 593
 594 =head2 escape_plaintext($text)
 595
 596   escape plaintext, then returns result.
 597
 598 =cut
 599
 600 sub escape_plaintext {
 601     my ($text) = @_;
 602
 603     $text =~ s/(?!|lt;|gt;)/&amp;/g;
 604     $text =~ s/</&lt;/g;
 605     $text =~ s/>/&gt;/g;
 606
 607     return $text;
 608 }
 609
 610 1;
 611