OSDN Git Service

new repo
[bytom/vapor.git] / vendor / golang.org / x / net / html / token_test.go
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package html
6
7 import (
8         "bytes"
9         "io"
10         "io/ioutil"
11         "reflect"
12         "runtime"
13         "strings"
14         "testing"
15 )
16
// tokenTest describes one tokenizer test case: an input HTML string and the
// string forms of the tokens that tokenizing it should yield.
type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	// An empty golden means only EOF behavior is checked (see TestTokenizer).
	golden string
}
25
26 var tokenTests = []tokenTest{
27         {
28                 "empty",
29                 "",
30                 "",
31         },
32         // A single text node. The tokenizer should not break text nodes on whitespace,
33         // nor should it normalize whitespace within a text node.
34         {
35                 "text",
36                 "foo  bar",
37                 "foo  bar",
38         },
39         // An entity.
40         {
41                 "entity",
42                 "one < two",
43                 "one < two",
44         },
45         // A start, self-closing and end tag. The tokenizer does not care if the start
46         // and end tokens don't match; that is the job of the parser.
47         {
48                 "tags",
49                 "<a>b<c/>d</e>",
50                 "<a>$b$<c/>$d$</e>",
51         },
52         // Angle brackets that aren't a tag.
53         {
54                 "not a tag #0",
55                 "<",
56                 "&lt;",
57         },
58         {
59                 "not a tag #1",
60                 "</",
61                 "&lt;/",
62         },
63         {
64                 "not a tag #2",
65                 "</>",
66                 "<!---->",
67         },
68         {
69                 "not a tag #3",
70                 "a</>b",
71                 "a$<!---->$b",
72         },
73         {
74                 "not a tag #4",
75                 "</ >",
76                 "<!-- -->",
77         },
78         {
79                 "not a tag #5",
80                 "</.",
81                 "<!--.-->",
82         },
83         {
84                 "not a tag #6",
85                 "</.>",
86                 "<!--.-->",
87         },
88         {
89                 "not a tag #7",
90                 "a < b",
91                 "a &lt; b",
92         },
93         {
94                 "not a tag #8",
95                 "<.>",
96                 "&lt;.&gt;",
97         },
98         {
99                 "not a tag #9",
100                 "a<<<b>>>c",
101                 "a&lt;&lt;$<b>$&gt;&gt;c",
102         },
103         {
104                 "not a tag #10",
105                 "if x<0 and y < 0 then x*y>0",
106                 "if x&lt;0 and y &lt; 0 then x*y&gt;0",
107         },
108         {
109                 "not a tag #11",
110                 "<<p>",
111                 "&lt;$<p>",
112         },
113         // EOF in a tag name.
114         {
115                 "tag name eof #0",
116                 "<a",
117                 "",
118         },
119         {
120                 "tag name eof #1",
121                 "<a ",
122                 "",
123         },
124         {
125                 "tag name eof #2",
126                 "a<b",
127                 "a",
128         },
129         {
130                 "tag name eof #3",
131                 "<a><b",
132                 "<a>",
133         },
134         {
135                 "tag name eof #4",
136                 `<a x`,
137                 ``,
138         },
139         // Some malformed tags that are missing a '>'.
140         {
141                 "malformed tag #0",
142                 `<p</p>`,
143                 `<p< p="">`,
144         },
145         {
146                 "malformed tag #1",
147                 `<p </p>`,
148                 `<p <="" p="">`,
149         },
150         {
151                 "malformed tag #2",
152                 `<p id`,
153                 ``,
154         },
155         {
156                 "malformed tag #3",
157                 `<p id=`,
158                 ``,
159         },
160         {
161                 "malformed tag #4",
162                 `<p id=>`,
163                 `<p id="">`,
164         },
165         {
166                 "malformed tag #5",
167                 `<p id=0`,
168                 ``,
169         },
170         {
171                 "malformed tag #6",
172                 `<p id=0</p>`,
173                 `<p id="0&lt;/p">`,
174         },
175         {
176                 "malformed tag #7",
177                 `<p id="0</p>`,
178                 ``,
179         },
180         {
181                 "malformed tag #8",
182                 `<p id="0"</p>`,
183                 `<p id="0" <="" p="">`,
184         },
185         {
186                 "malformed tag #9",
187                 `<p></p id`,
188                 `<p>`,
189         },
190         // Raw text and RCDATA.
191         {
192                 "basic raw text",
193                 "<script><a></b></script>",
194                 "<script>$&lt;a&gt;&lt;/b&gt;$</script>",
195         },
196         {
197                 "unfinished script end tag",
198                 "<SCRIPT>a</SCR",
199                 "<script>$a&lt;/SCR",
200         },
201         {
202                 "broken script end tag",
203                 "<SCRIPT>a</SCR ipt>",
204                 "<script>$a&lt;/SCR ipt&gt;",
205         },
206         {
207                 "EOF in script end tag",
208                 "<SCRIPT>a</SCRipt",
209                 "<script>$a&lt;/SCRipt",
210         },
211         {
212                 "scriptx end tag",
213                 "<SCRIPT>a</SCRiptx",
214                 "<script>$a&lt;/SCRiptx",
215         },
216         {
217                 "' ' completes script end tag",
218                 "<SCRIPT>a</SCRipt ",
219                 "<script>$a",
220         },
221         {
222                 "'>' completes script end tag",
223                 "<SCRIPT>a</SCRipt>",
224                 "<script>$a$</script>",
225         },
226         {
227                 "self-closing script end tag",
228                 "<SCRIPT>a</SCRipt/>",
229                 "<script>$a$</script>",
230         },
231         {
232                 "nested script tag",
233                 "<SCRIPT>a</SCRipt<script>",
234                 "<script>$a&lt;/SCRipt&lt;script&gt;",
235         },
236         {
237                 "script end tag after unfinished",
238                 "<SCRIPT>a</SCRipt</script>",
239                 "<script>$a&lt;/SCRipt$</script>",
240         },
241         {
242                 "script/style mismatched tags",
243                 "<script>a</style>",
244                 "<script>$a&lt;/style&gt;",
245         },
246         {
247                 "style element with entity",
248                 "<style>&apos;",
249                 "<style>$&amp;apos;",
250         },
251         {
252                 "textarea with tag",
253                 "<textarea><div></textarea>",
254                 "<textarea>$&lt;div&gt;$</textarea>",
255         },
256         {
257                 "title with tag and entity",
258                 "<title><b>K&amp;R C</b></title>",
259                 "<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
260         },
261         // DOCTYPE tests.
262         {
263                 "Proper DOCTYPE",
264                 "<!DOCTYPE html>",
265                 "<!DOCTYPE html>",
266         },
267         {
268                 "DOCTYPE with no space",
269                 "<!doctypehtml>",
270                 "<!DOCTYPE html>",
271         },
272         {
273                 "DOCTYPE with two spaces",
274                 "<!doctype  html>",
275                 "<!DOCTYPE html>",
276         },
277         {
278                 "looks like DOCTYPE but isn't",
279                 "<!DOCUMENT html>",
280                 "<!--DOCUMENT html-->",
281         },
282         {
283                 "DOCTYPE at EOF",
284                 "<!DOCtype",
285                 "<!DOCTYPE >",
286         },
287         // XML processing instructions.
288         {
289                 "XML processing instruction",
290                 "<?xml?>",
291                 "<!--?xml?-->",
292         },
293         // Comments.
294         {
295                 "comment0",
296                 "abc<b><!-- skipme --></b>def",
297                 "abc$<b>$<!-- skipme -->$</b>$def",
298         },
299         {
300                 "comment1",
301                 "a<!-->z",
302                 "a$<!---->$z",
303         },
304         {
305                 "comment2",
306                 "a<!--->z",
307                 "a$<!---->$z",
308         },
309         {
310                 "comment3",
311                 "a<!--x>-->z",
312                 "a$<!--x>-->$z",
313         },
314         {
315                 "comment4",
316                 "a<!--x->-->z",
317                 "a$<!--x->-->$z",
318         },
319         {
320                 "comment5",
321                 "a<!>z",
322                 "a$<!---->$z",
323         },
324         {
325                 "comment6",
326                 "a<!->z",
327                 "a$<!----->$z",
328         },
329         {
330                 "comment7",
331                 "a<!---<>z",
332                 "a$<!---<>z-->",
333         },
334         {
335                 "comment8",
336                 "a<!--z",
337                 "a$<!--z-->",
338         },
339         {
340                 "comment9",
341                 "a<!--z-",
342                 "a$<!--z-->",
343         },
344         {
345                 "comment10",
346                 "a<!--z--",
347                 "a$<!--z-->",
348         },
349         {
350                 "comment11",
351                 "a<!--z---",
352                 "a$<!--z--->",
353         },
354         {
355                 "comment12",
356                 "a<!--z----",
357                 "a$<!--z---->",
358         },
359         {
360                 "comment13",
361                 "a<!--x--!>z",
362                 "a$<!--x-->$z",
363         },
364         // An attribute with a backslash.
365         {
366                 "backslash",
367                 `<p id="a\"b">`,
368                 `<p id="a\" b"="">`,
369         },
370         // Entities, tag name and attribute key lower-casing, and whitespace
371         // normalization within a tag.
372         {
373                 "tricky",
374                 "<p \t\n iD=\"a&quot;B\"  foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
375                 `<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
376         },
377         // A nonexistent entity. Tokenizing and converting back to a string should
378         // escape the "&" to become "&amp;".
379         {
380                 "noSuchEntity",
381                 `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
382                 `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
383         },
384         {
385                 "entity without semicolon",
386                 `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
387                 `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
388         },
389         {
390                 "entity with digits",
391                 "&frac12;",
392                 "½",
393         },
394         // Attribute tests:
395         // http://dev.w3.org/html5/pf-summary/Overview.html#attributes
396         {
397                 "Empty attribute",
398                 `<input disabled FOO>`,
399                 `<input disabled="" foo="">`,
400         },
401         {
402                 "Empty attribute, whitespace",
403                 `<input disabled FOO >`,
404                 `<input disabled="" foo="">`,
405         },
406         {
407                 "Unquoted attribute value",
408                 `<input value=yes FOO=BAR>`,
409                 `<input value="yes" foo="BAR">`,
410         },
411         {
412                 "Unquoted attribute value, spaces",
413                 `<input value = yes FOO = BAR>`,
414                 `<input value="yes" foo="BAR">`,
415         },
416         {
417                 "Unquoted attribute value, trailing space",
418                 `<input value=yes FOO=BAR >`,
419                 `<input value="yes" foo="BAR">`,
420         },
421         {
422                 "Single-quoted attribute value",
423                 `<input value='yes' FOO='BAR'>`,
424                 `<input value="yes" foo="BAR">`,
425         },
426         {
427                 "Single-quoted attribute value, trailing space",
428                 `<input value='yes' FOO='BAR' >`,
429                 `<input value="yes" foo="BAR">`,
430         },
431         {
432                 "Double-quoted attribute value",
433                 `<input value="I'm an attribute" FOO="BAR">`,
434                 `<input value="I&#39;m an attribute" foo="BAR">`,
435         },
436         {
437                 "Attribute name characters",
438                 `<meta http-equiv="content-type">`,
439                 `<meta http-equiv="content-type">`,
440         },
441         {
442                 "Mixed attributes",
443                 `a<P V="0 1" w='2' X=3 y>z`,
444                 `a$<p v="0 1" w="2" x="3" y="">$z`,
445         },
446         {
447                 "Attributes with a solitary single quote",
448                 `<p id=can't><p id=won't>`,
449                 `<p id="can&#39;t">$<p id="won&#39;t">`,
450         },
451 }
452
453 func TestTokenizer(t *testing.T) {
454 loop:
455         for _, tt := range tokenTests {
456                 z := NewTokenizer(strings.NewReader(tt.html))
457                 if tt.golden != "" {
458                         for i, s := range strings.Split(tt.golden, "$") {
459                                 if z.Next() == ErrorToken {
460                                         t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
461                                         continue loop
462                                 }
463                                 actual := z.Token().String()
464                                 if s != actual {
465                                         t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
466                                         continue loop
467                                 }
468                         }
469                 }
470                 z.Next()
471                 if z.Err() != io.EOF {
472                         t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
473                 }
474         }
475 }
476
477 func TestMaxBuffer(t *testing.T) {
478         // Exceeding the maximum buffer size generates ErrBufferExceeded.
479         z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
480         z.SetMaxBuf(5)
481         tt := z.Next()
482         if got, want := tt, ErrorToken; got != want {
483                 t.Fatalf("token type: got: %v want: %v", got, want)
484         }
485         if got, want := z.Err(), ErrBufferExceeded; got != want {
486                 t.Errorf("error type: got: %v want: %v", got, want)
487         }
488         if got, want := string(z.Raw()), "<tttt"; got != want {
489                 t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
490         }
491 }
492
493 func TestMaxBufferReconstruction(t *testing.T) {
494         // Exceeding the maximum buffer size at any point while tokenizing permits
495         // reconstructing the original input.
496 tests:
497         for _, test := range tokenTests {
498                 for maxBuf := 1; ; maxBuf++ {
499                         r := strings.NewReader(test.html)
500                         z := NewTokenizer(r)
501                         z.SetMaxBuf(maxBuf)
502                         var tokenized bytes.Buffer
503                         for {
504                                 tt := z.Next()
505                                 tokenized.Write(z.Raw())
506                                 if tt == ErrorToken {
507                                         if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
508                                                 t.Errorf("%s: unexpected error: %v", test.desc, err)
509                                         }
510                                         break
511                                 }
512                         }
513                         // Anything tokenized along with untokenized input or data left in the reader.
514                         assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
515                         if err != nil {
516                                 t.Errorf("%s: ReadAll: %v", test.desc, err)
517                                 continue tests
518                         }
519                         if got, want := string(assembled), test.html; got != want {
520                                 t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
521                                 continue tests
522                         }
523                         // EOF indicates that we completed tokenization and hence found the max
524                         // maxBuf that generates ErrBufferExceeded, so continue to the next test.
525                         if z.Err() == io.EOF {
526                                 break
527                         }
528                 } // buffer sizes
529         } // tests
530 }
531
532 func TestPassthrough(t *testing.T) {
533         // Accumulating the raw output for each parse event should reconstruct the
534         // original input.
535         for _, test := range tokenTests {
536                 z := NewTokenizer(strings.NewReader(test.html))
537                 var parsed bytes.Buffer
538                 for {
539                         tt := z.Next()
540                         parsed.Write(z.Raw())
541                         if tt == ErrorToken {
542                                 break
543                         }
544                 }
545                 if got, want := parsed.String(), test.html; got != want {
546                         t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
547                 }
548         }
549 }
550
551 func TestBufAPI(t *testing.T) {
552         s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
553         z := NewTokenizer(bytes.NewBufferString(s))
554         var result bytes.Buffer
555         depth := 0
556 loop:
557         for {
558                 tt := z.Next()
559                 switch tt {
560                 case ErrorToken:
561                         if z.Err() != io.EOF {
562                                 t.Error(z.Err())
563                         }
564                         break loop
565                 case TextToken:
566                         if depth > 0 {
567                                 result.Write(z.Text())
568                         }
569                 case StartTagToken, EndTagToken:
570                         tn, _ := z.TagName()
571                         if len(tn) == 1 && tn[0] == 'a' {
572                                 if tt == StartTagToken {
573                                         depth++
574                                 } else {
575                                         depth--
576                                 }
577                         }
578                 }
579         }
580         u := "14567"
581         v := string(result.Bytes())
582         if u != v {
583                 t.Errorf("TestBufAPI: want %q got %q", u, v)
584         }
585 }
586
587 func TestConvertNewlines(t *testing.T) {
588         testCases := map[string]string{
589                 "Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
590                 "Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
591                 "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
592                 "":         "",
593                 "\n":       "\n",
594                 "\n\r":     "\n\n",
595                 "\r":       "\n",
596                 "\r\n":     "\n",
597                 "\r\n\n":   "\n\n",
598                 "\r\n\r":   "\n\n",
599                 "\r\n\r\n": "\n\n",
600                 "\r\r":     "\n\n",
601                 "\r\r\n":   "\n\n",
602                 "\r\r\n\n": "\n\n\n",
603                 "\r\r\r\n": "\n\n\n",
604                 "\r \n":    "\n \n",
605                 "xyz":      "xyz",
606         }
607         for in, want := range testCases {
608                 if got := string(convertNewlines([]byte(in))); got != want {
609                         t.Errorf("input %q: got %q, want %q", in, got, want)
610                 }
611         }
612 }
613
614 func TestReaderEdgeCases(t *testing.T) {
615         const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
616         testCases := []io.Reader{
617                 &zeroOneByteReader{s: s},
618                 &eofStringsReader{s: s},
619                 &stuckReader{},
620         }
621         for i, tc := range testCases {
622                 got := []TokenType{}
623                 z := NewTokenizer(tc)
624                 for {
625                         tt := z.Next()
626                         if tt == ErrorToken {
627                                 break
628                         }
629                         got = append(got, tt)
630                 }
631                 if err := z.Err(); err != nil && err != io.EOF {
632                         if err != io.ErrNoProgress {
633                                 t.Errorf("i=%d: %v", i, err)
634                         }
635                         continue
636                 }
637                 want := []TokenType{
638                         StartTagToken,
639                         TextToken,
640                         EndTagToken,
641                 }
642                 if !reflect.DeepEqual(got, want) {
643                         t.Errorf("i=%d: got %v, want %v", i, got, want)
644                         continue
645                 }
646         }
647 }
648
// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

// Read yields at most one byte per call; every other call yields none.
func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	switch {
	case len(p) == 0:
		return 0, nil
	case len(r.s) == 0:
		return 0, io.EOF
	}
	r.n++
	if r.n%2 == 1 {
		// Odd-numbered calls make no progress.
		return 0, nil
	}
	p[0] = r.s[0]
	r.s = r.s[1:]
	return 1, nil
}
670
// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

// Read copies as much as fits into p and reports io.EOF on the same call
// that drains the remaining input.
func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if len(r.s) > 0 {
		return n, nil
	}
	return n, io.EOF
}
685
// stuckReader is an io.Reader that always returns no data and no error.
type stuckReader struct{}

// Read reports zero bytes and a nil error, forever — i.e. no progress.
func (*stuckReader) Read([]byte) (int, error) {
	return 0, nil
}
692
// Consumption levels for benchmarkTokenizer, from least to most work done
// per token (see the per-case comments in benchmarkTokenizer).
const (
	rawLevel = iota
	lowLevel
	highLevel
)
698
// benchmarkTokenizer tokenizes testdata/go1.html once per benchmark
// iteration, consuming the token stream at the given API level (rawLevel,
// lowLevel or highLevel) to measure the cost of each level of processing.
func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose validity
				// extend beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}
745
// Benchmarks for the three consumption levels; see benchmarkTokenizer.
func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }