OSDN Git Service

new repo
[bytom/vapor.git] / vendor / golang.org / x / net / html / token_test.go
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package html
6
7 import (
8         "bytes"
9         "io"
10         "io/ioutil"
11         "reflect"
12         "runtime"
13         "strings"
14         "testing"
15 )
16
// tokenTest describes one tokenizer test case: an input HTML string and the
// string forms of the tokens that tokenizing it should yield.
type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	// An empty golden means only EOF behavior is checked (see TestTokenizer).
	golden string
}
25
26 var tokenTests = []tokenTest{
27         {
28                 "empty",
29                 "",
30                 "",
31         },
32         // A single text node. The tokenizer should not break text nodes on whitespace,
33         // nor should it normalize whitespace within a text node.
34         {
35                 "text",
36                 "foo  bar",
37                 "foo  bar",
38         },
39         // An entity.
40         {
41                 "entity",
42                 "one < two",
43                 "one < two",
44         },
45         // A start, self-closing and end tag. The tokenizer does not care if the start
46         // and end tokens don't match; that is the job of the parser.
47         {
48                 "tags",
49                 "<a>b<c/>d</e>",
50                 "<a>$b$<c/>$d$</e>",
51         },
52         // Angle brackets that aren't a tag.
53         {
54                 "not a tag #0",
55                 "<",
56                 "&lt;",
57         },
58         {
59                 "not a tag #1",
60                 "</",
61                 "&lt;/",
62         },
63         {
64                 "not a tag #2",
65                 "</>",
66                 "<!---->",
67         },
68         {
69                 "not a tag #3",
70                 "a</>b",
71                 "a$<!---->$b",
72         },
73         {
74                 "not a tag #4",
75                 "</ >",
76                 "<!-- -->",
77         },
78         {
79                 "not a tag #5",
80                 "</.",
81                 "<!--.-->",
82         },
83         {
84                 "not a tag #6",
85                 "</.>",
86                 "<!--.-->",
87         },
88         {
89                 "not a tag #7",
90                 "a < b",
91                 "a &lt; b",
92         },
93         {
94                 "not a tag #8",
95                 "<.>",
96                 "&lt;.&gt;",
97         },
98         {
99                 "not a tag #9",
100                 "a<<<b>>>c",
101                 "a&lt;&lt;$<b>$&gt;&gt;c",
102         },
103         {
104                 "not a tag #10",
105                 "if x<0 and y < 0 then x*y>0",
106                 "if x&lt;0 and y &lt; 0 then x*y&gt;0",
107         },
108         {
109                 "not a tag #11",
110                 "<<p>",
111                 "&lt;$<p>",
112         },
113         // EOF in a tag name.
114         {
115                 "tag name eof #0",
116                 "<a",
117                 "",
118         },
119         {
120                 "tag name eof #1",
121                 "<a ",
122                 "",
123         },
124         {
125                 "tag name eof #2",
126                 "a<b",
127                 "a",
128         },
129         {
130                 "tag name eof #3",
131                 "<a><b",
132                 "<a>",
133         },
134         {
135                 "tag name eof #4",
136                 `<a x`,
137                 ``,
138         },
139         // Some malformed tags that are missing a '>'.
140         {
141                 "malformed tag #0",
142                 `<p</p>`,
143                 `<p< p="">`,
144         },
145         {
146                 "malformed tag #1",
147                 `<p </p>`,
148                 `<p <="" p="">`,
149         },
150         {
151                 "malformed tag #2",
152                 `<p id`,
153                 ``,
154         },
155         {
156                 "malformed tag #3",
157                 `<p id=`,
158                 ``,
159         },
160         {
161                 "malformed tag #4",
162                 `<p id=>`,
163                 `<p id="">`,
164         },
165         {
166                 "malformed tag #5",
167                 `<p id=0`,
168                 ``,
169         },
170         {
171                 "malformed tag #6",
172                 `<p id=0</p>`,
173                 `<p id="0&lt;/p">`,
174         },
175         {
176                 "malformed tag #7",
177                 `<p id="0</p>`,
178                 ``,
179         },
180         {
181                 "malformed tag #8",
182                 `<p id="0"</p>`,
183                 `<p id="0" <="" p="">`,
184         },
185         {
186                 "malformed tag #9",
187                 `<p></p id`,
188                 `<p>`,
189         },
190         // Raw text and RCDATA.
191         {
192                 "basic raw text",
193                 "<script><a></b></script>",
194                 "<script>$&lt;a&gt;&lt;/b&gt;$</script>",
195         },
196         {
197                 "unfinished script end tag",
198                 "<SCRIPT>a</SCR",
199                 "<script>$a&lt;/SCR",
200         },
201         {
202                 "broken script end tag",
203                 "<SCRIPT>a</SCR ipt>",
204                 "<script>$a&lt;/SCR ipt&gt;",
205         },
206         {
207                 "EOF in script end tag",
208                 "<SCRIPT>a</SCRipt",
209                 "<script>$a&lt;/SCRipt",
210         },
211         {
212                 "scriptx end tag",
213                 "<SCRIPT>a</SCRiptx",
214                 "<script>$a&lt;/SCRiptx",
215         },
216         {
217                 "' ' completes script end tag",
218                 "<SCRIPT>a</SCRipt ",
219                 "<script>$a",
220         },
221         {
222                 "'>' completes script end tag",
223                 "<SCRIPT>a</SCRipt>",
224                 "<script>$a$</script>",
225         },
226         {
227                 "self-closing script end tag",
228                 "<SCRIPT>a</SCRipt/>",
229                 "<script>$a$</script>",
230         },
231         {
232                 "nested script tag",
233                 "<SCRIPT>a</SCRipt<script>",
234                 "<script>$a&lt;/SCRipt&lt;script&gt;",
235         },
236         {
237                 "script end tag after unfinished",
238                 "<SCRIPT>a</SCRipt</script>",
239                 "<script>$a&lt;/SCRipt$</script>",
240         },
241         {
242                 "script/style mismatched tags",
243                 "<script>a</style>",
244                 "<script>$a&lt;/style&gt;",
245         },
246         {
247                 "style element with entity",
248                 "<style>&apos;",
249                 "<style>$&amp;apos;",
250         },
251         {
252                 "textarea with tag",
253                 "<textarea><div></textarea>",
254                 "<textarea>$&lt;div&gt;$</textarea>",
255         },
256         {
257                 "title with tag and entity",
258                 "<title><b>K&amp;R C</b></title>",
259                 "<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
260         },
261         // DOCTYPE tests.
262         {
263                 "Proper DOCTYPE",
264                 "<!DOCTYPE html>",
265                 "<!DOCTYPE html>",
266         },
267         {
268                 "DOCTYPE with no space",
269                 "<!doctypehtml>",
270                 "<!DOCTYPE html>",
271         },
272         {
273                 "DOCTYPE with two spaces",
274                 "<!doctype  html>",
275                 "<!DOCTYPE html>",
276         },
277         {
278                 "looks like DOCTYPE but isn't",
279                 "<!DOCUMENT html>",
280                 "<!--DOCUMENT html-->",
281         },
282         {
283                 "DOCTYPE at EOF",
284                 "<!DOCtype",
285                 "<!DOCTYPE >",
286         },
287         // XML processing instructions.
288         {
289                 "XML processing instruction",
290                 "<?xml?>",
291                 "<!--?xml?-->",
292         },
293         // Comments.
294         {
295                 "comment0",
296                 "abc<b><!-- skipme --></b>def",
297                 "abc$<b>$<!-- skipme -->$</b>$def",
298         },
299         {
300                 "comment1",
301                 "a<!-->z",
302                 "a$<!---->$z",
303         },
304         {
305                 "comment2",
306                 "a<!--->z",
307                 "a$<!---->$z",
308         },
309         {
310                 "comment3",
311                 "a<!--x>-->z",
312                 "a$<!--x>-->$z",
313         },
314         {
315                 "comment4",
316                 "a<!--x->-->z",
317                 "a$<!--x->-->$z",
318         },
319         {
320                 "comment5",
321                 "a<!>z",
322                 "a$<!---->$z",
323         },
324         {
325                 "comment6",
326                 "a<!->z",
327                 "a$<!----->$z",
328         },
329         {
330                 "comment7",
331                 "a<!---<>z",
332                 "a$<!---<>z-->",
333         },
334         {
335                 "comment8",
336                 "a<!--z",
337                 "a$<!--z-->",
338         },
339         {
340                 "comment9",
341                 "a<!--z-",
342                 "a$<!--z-->",
343         },
344         {
345                 "comment10",
346                 "a<!--z--",
347                 "a$<!--z-->",
348         },
349         {
350                 "comment11",
351                 "a<!--z---",
352                 "a$<!--z--->",
353         },
354         {
355                 "comment12",
356                 "a<!--z----",
357                 "a$<!--z---->",
358         },
359         {
360                 "comment13",
361                 "a<!--x--!>z",
362                 "a$<!--x-->$z",
363         },
364         // An attribute with a backslash.
365         {
366                 "backslash",
367                 `<p id="a\"b">`,
368                 `<p id="a\" b"="">`,
369         },
370         // Entities, tag name and attribute key lower-casing, and whitespace
371         // normalization within a tag.
372         {
373                 "tricky",
374                 "<p \t\n iD=\"a&quot;B\"  foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
375                 `<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
376         },
377         // A nonexistent entity. Tokenizing and converting back to a string should
378         // escape the "&" to become "&amp;".
379         {
380                 "noSuchEntity",
381                 `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
382                 `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
383         },
384         {
385                 "entity without semicolon",
386                 `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
387                 `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
388         },
389         {
390                 "entity with digits",
391                 "&frac12;",
392                 "½",
393         },
394         // Attribute tests:
395         // http://dev.w3.org/html5/pf-summary/Overview.html#attributes
396         {
397                 "Empty attribute",
398                 `<input disabled FOO>`,
399                 `<input disabled="" foo="">`,
400         },
401         {
402                 "Empty attribute, whitespace",
403                 `<input disabled FOO >`,
404                 `<input disabled="" foo="">`,
405         },
406         {
407                 "Unquoted attribute value",
408                 `<input value=yes FOO=BAR>`,
409                 `<input value="yes" foo="BAR">`,
410         },
411         {
412                 "Unquoted attribute value, spaces",
413                 `<input value = yes FOO = BAR>`,
414                 `<input value="yes" foo="BAR">`,
415         },
416         {
417                 "Unquoted attribute value, trailing space",
418                 `<input value=yes FOO=BAR >`,
419                 `<input value="yes" foo="BAR">`,
420         },
421         {
422                 "Single-quoted attribute value",
423                 `<input value='yes' FOO='BAR'>`,
424                 `<input value="yes" foo="BAR">`,
425         },
426         {
427                 "Single-quoted attribute value, trailing space",
428                 `<input value='yes' FOO='BAR' >`,
429                 `<input value="yes" foo="BAR">`,
430         },
431         {
432                 "Double-quoted attribute value",
433                 `<input value="I'm an attribute" FOO="BAR">`,
434                 `<input value="I&#39;m an attribute" foo="BAR">`,
435         },
436         {
437                 "Attribute name characters",
438                 `<meta http-equiv="content-type">`,
439                 `<meta http-equiv="content-type">`,
440         },
441         {
442                 "Mixed attributes",
443                 `a<P V="0 1" w='2' X=3 y>z`,
444                 `a$<p v="0 1" w="2" x="3" y="">$z`,
445         },
446         {
447                 "Attributes with a solitary single quote",
448                 `<p id=can't><p id=won't>`,
449                 `<p id="can&#39;t">$<p id="won&#39;t">`,
450         },
451 }
452
453 func TestTokenizer(t *testing.T) {
454 loop:
455         for _, tt := range tokenTests {
456                 z := NewTokenizer(strings.NewReader(tt.html))
457                 if tt.golden != "" {
458                         for i, s := range strings.Split(tt.golden, "$") {
459                                 if z.Next() == ErrorToken {
460                                         t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
461                                         continue loop
462                                 }
463                                 actual := z.Token().String()
464                                 if s != actual {
465                                         t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
466                                         continue loop
467                                 }
468                         }
469                 }
470                 z.Next()
471                 if z.Err() != io.EOF {
472                         t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
473                 }
474         }
475 }
476
477 func TestMaxBuffer(t *testing.T) {
478         // Exceeding the maximum buffer size generates ErrBufferExceeded.
479         z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
480         z.SetMaxBuf(5)
481         tt := z.Next()
482         if got, want := tt, ErrorToken; got != want {
483                 t.Fatalf("token type: got: %v want: %v", got, want)
484         }
485         if got, want := z.Err(), ErrBufferExceeded; got != want {
486                 t.Errorf("error type: got: %v want: %v", got, want)
487         }
488         if got, want := string(z.Raw()), "<tttt"; got != want {
489                 t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
490         }
491 }
492
493 func TestMaxBufferReconstruction(t *testing.T) {
494         // Exceeding the maximum buffer size at any point while tokenizing permits
495         // reconstructing the original input.
496 tests:
497         for _, test := range tokenTests {
498                 for maxBuf := 1; ; maxBuf++ {
499                         r := strings.NewReader(test.html)
500                         z := NewTokenizer(r)
501                         z.SetMaxBuf(maxBuf)
502                         var tokenized bytes.Buffer
503                         for {
504                                 tt := z.Next()
505                                 tokenized.Write(z.Raw())
506                                 if tt == ErrorToken {
507                                         if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
508                                                 t.Errorf("%s: unexpected error: %v", test.desc, err)
509                                         }
510                                         break
511                                 }
512                         }
513                         // Anything tokenized along with untokenized input or data left in the reader.
514                         assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
515                         if err != nil {
516                                 t.Errorf("%s: ReadAll: %v", test.desc, err)
517                                 continue tests
518                         }
519                         if got, want := string(assembled), test.html; got != want {
520                                 t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
521                                 continue tests
522                         }
523                         // EOF indicates that we completed tokenization and hence found the max
524                         // maxBuf that generates ErrBufferExceeded, so continue to the next test.
525                         if z.Err() == io.EOF {
526                                 break
527                         }
528                 } // buffer sizes
529         } // tests
530 }
531
532 func TestPassthrough(t *testing.T) {
533         // Accumulating the raw output for each parse event should reconstruct the
534         // original input.
535         for _, test := range tokenTests {
536                 z := NewTokenizer(strings.NewReader(test.html))
537                 var parsed bytes.Buffer
538                 for {
539                         tt := z.Next()
540                         parsed.Write(z.Raw())
541                         if tt == ErrorToken {
542                                 break
543                         }
544                 }
545                 if got, want := parsed.String(), test.html; got != want {
546                         t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
547                 }
548         }
549 }
550
551 func TestBufAPI(t *testing.T) {
552         s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
553         z := NewTokenizer(bytes.NewBufferString(s))
554         var result bytes.Buffer
555         depth := 0
556 loop:
557         for {
558                 tt := z.Next()
559                 switch tt {
560                 case ErrorToken:
561                         if z.Err() != io.EOF {
562                                 t.Error(z.Err())
563                         }
564                         break loop
565                 case TextToken:
566                         if depth > 0 {
567                                 result.Write(z.Text())
568                         }
569                 case StartTagToken, EndTagToken:
570                         tn, _ := z.TagName()
571                         if len(tn) == 1 && tn[0] == 'a' {
572                                 if tt == StartTagToken {
573                                         depth++
574                                 } else {
575                                         depth--
576                                 }
577                         }
578                 }
579         }
580         u := "14567"
581         v := string(result.Bytes())
582         if u != v {
583                 t.Errorf("TestBufAPI: want %q got %q", u, v)
584         }
585 }
586
587 func TestConvertNewlines(t *testing.T) {
588         testCases := map[string]string{
589                 "Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
590                 "Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
591                 "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
592                 "":         "",
593                 "\n":       "\n",
594                 "\n\r":     "\n\n",
595                 "\r":       "\n",
596                 "\r\n":     "\n",
597                 "\r\n\n":   "\n\n",
598                 "\r\n\r":   "\n\n",
599                 "\r\n\r\n": "\n\n",
600                 "\r\r":     "\n\n",
601                 "\r\r\n":   "\n\n",
602                 "\r\r\n\n": "\n\n\n",
603                 "\r\r\r\n": "\n\n\n",
604                 "\r \n":    "\n \n",
605                 "xyz":      "xyz",
606         }
607         for in, want := range testCases {
608                 if got := string(convertNewlines([]byte(in))); got != want {
609                         t.Errorf("input %q: got %q, want %q", in, got, want)
610                 }
611         }
612 }
613
614 func TestReaderEdgeCases(t *testing.T) {
615         const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
616         testCases := []io.Reader{
617                 &zeroOneByteReader{s: s},
618                 &eofStringsReader{s: s},
619                 &stuckReader{},
620         }
621         for i, tc := range testCases {
622                 got := []TokenType{}
623                 z := NewTokenizer(tc)
624                 for {
625                         tt := z.Next()
626                         if tt == ErrorToken {
627                                 break
628                         }
629                         got = append(got, tt)
630                 }
631                 if err := z.Err(); err != nil && err != io.EOF {
632                         if err != io.ErrNoProgress {
633                                 t.Errorf("i=%d: %v", i, err)
634                         }
635                         continue
636                 }
637                 want := []TokenType{
638                         StartTagToken,
639                         TextToken,
640                         EndTagToken,
641                 }
642                 if !reflect.DeepEqual(got, want) {
643                         t.Errorf("i=%d: got %v, want %v", i, got, want)
644                         continue
645                 }
646         }
647 }
648
// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

// Read yields at most one byte per call; every other call yields none.
func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	switch {
	case len(p) == 0:
		return 0, nil
	case len(r.s) == 0:
		return 0, io.EOF
	}
	r.n++
	if r.n%2 == 1 {
		// Odd-numbered calls make no progress.
		return 0, nil
	}
	p[0] = r.s[0]
	r.s = r.s[1:]
	return 1, nil
}
670
// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

// Read copies as much as fits into p and reports io.EOF on the same call
// that drains the remaining input.
func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if len(r.s) > 0 {
		return n, nil
	}
	return n, io.EOF
}
685
// stuckReader is an io.Reader that always returns no data and no error.
type stuckReader struct{}

// Read reports zero bytes and a nil error, forever — i.e. no progress.
func (*stuckReader) Read([]byte) (int, error) {
	return 0, nil
}
692
// Consumption levels for benchmarkTokenizer, from least to most work done
// per token (see the per-case comments in benchmarkTokenizer).
const (
	rawLevel = iota
	lowLevel
	highLevel
)
698
// benchmarkTokenizer tokenizes testdata/go1.html once per benchmark
// iteration, consuming the token stream at the given API level (rawLevel,
// lowLevel or highLevel) to measure the cost of each level of processing.
func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose validity
				// extend beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}
745
// Benchmarks for the three consumption levels; see benchmarkTokenizer.
func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }