OSDN Git Service

new repo
[bytom/vapor.git] / vendor / golang.org / x / net / html / parse.go
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package html
6
7 import (
8         "errors"
9         "fmt"
10         "io"
11         "strings"
12
13         a "golang.org/x/net/html/atom"
14 )
15
16 // A parser implements the HTML5 parsing algorithm:
17 // https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
18 type parser struct {
19         // tokenizer provides the tokens for the parser.
20         tokenizer *Tokenizer
21         // tok is the most recently read token.
22         tok Token
23         // Self-closing tags like <hr/> are treated as start tags, except that
24         // hasSelfClosingToken is set while they are being processed.
25         hasSelfClosingToken bool
26         // doc is the document root element.
27         doc *Node
28         // The stack of open elements (section 12.2.3.2) and active formatting
29         // elements (section 12.2.3.3).
30         oe, afe nodeStack
31         // Element pointers (section 12.2.3.4).
32         head, form *Node
33         // Other parsing state flags (section 12.2.3.5).
34         scripting, framesetOK bool
35         // im is the current insertion mode.
36         im insertionMode
37         // originalIM is the insertion mode to go back to after completing a text
38         // or inTableText insertion mode.
39         originalIM insertionMode
40         // fosterParenting is whether new elements should be inserted according to
41         // the foster parenting rules (section 12.2.5.3).
42         fosterParenting bool
43         // quirks is whether the parser is operating in "quirks mode."
44         quirks bool
45         // fragment is whether the parser is parsing an HTML fragment.
46         fragment bool
47         // context is the context element when parsing an HTML fragment
48         // (section 12.4).
49         context *Node
50 }
51
52 func (p *parser) top() *Node {
53         if n := p.oe.top(); n != nil {
54                 return n
55         }
56         return p.doc
57 }
58
59 // Stop tags for use in popUntil. These come from section 12.2.3.2.
60 var (
61         defaultScopeStopTags = map[string][]a.Atom{
62                 "":     {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
63                 "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
64                 "svg":  {a.Desc, a.ForeignObject, a.Title},
65         }
66 )
67
// scope selects which set of stop tags applies when searching or clearing
// the stack of open elements.
type scope int

const (
	// defaultScope stops only at the per-namespace default stop tags.
	defaultScope = scope(iota)
	// listItemScope additionally stops at ol and ul.
	listItemScope
	// buttonScope additionally stops at button.
	buttonScope
	// tableScope stops at html and table.
	tableScope
	// tableRowScope is used when clearing the stack back to html or tr.
	tableRowScope
	// tableBodyScope is used when clearing the stack back to html, tbody,
	// tfoot or thead.
	tableBodyScope
	// selectScope stops at every tag other than optgroup and option.
	selectScope
)
79
80 // popUntil pops the stack of open elements at the highest element whose tag
81 // is in matchTags, provided there is no higher element in the scope's stop
82 // tags (as defined in section 12.2.3.2). It returns whether or not there was
83 // such an element. If there was not, popUntil leaves the stack unchanged.
84 //
85 // For example, the set of stop tags for table scope is: "html", "table". If
86 // the stack was:
87 // ["html", "body", "font", "table", "b", "i", "u"]
88 // then popUntil(tableScope, "font") would return false, but
89 // popUntil(tableScope, "i") would return true and the stack would become:
90 // ["html", "body", "font", "table", "b"]
91 //
92 // If an element's tag is in both the stop tags and matchTags, then the stack
93 // will be popped and the function returns true (provided, of course, there was
94 // no higher element in the stack that was also in the stop tags). For example,
95 // popUntil(tableScope, "table") returns true and leaves:
96 // ["html", "body", "font"]
97 func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
98         if i := p.indexOfElementInScope(s, matchTags...); i != -1 {
99                 p.oe = p.oe[:i]
100                 return true
101         }
102         return false
103 }
104
105 // indexOfElementInScope returns the index in p.oe of the highest element whose
106 // tag is in matchTags that is in scope. If no matching element is in scope, it
107 // returns -1.
108 func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
109         for i := len(p.oe) - 1; i >= 0; i-- {
110                 tagAtom := p.oe[i].DataAtom
111                 if p.oe[i].Namespace == "" {
112                         for _, t := range matchTags {
113                                 if t == tagAtom {
114                                         return i
115                                 }
116                         }
117                         switch s {
118                         case defaultScope:
119                                 // No-op.
120                         case listItemScope:
121                                 if tagAtom == a.Ol || tagAtom == a.Ul {
122                                         return -1
123                                 }
124                         case buttonScope:
125                                 if tagAtom == a.Button {
126                                         return -1
127                                 }
128                         case tableScope:
129                                 if tagAtom == a.Html || tagAtom == a.Table {
130                                         return -1
131                                 }
132                         case selectScope:
133                                 if tagAtom != a.Optgroup && tagAtom != a.Option {
134                                         return -1
135                                 }
136                         default:
137                                 panic("unreachable")
138                         }
139                 }
140                 switch s {
141                 case defaultScope, listItemScope, buttonScope:
142                         for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {
143                                 if t == tagAtom {
144                                         return -1
145                                 }
146                         }
147                 }
148         }
149         return -1
150 }
151
152 // elementInScope is like popUntil, except that it doesn't modify the stack of
153 // open elements.
154 func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {
155         return p.indexOfElementInScope(s, matchTags...) != -1
156 }
157
158 // clearStackToContext pops elements off the stack of open elements until a
159 // scope-defined element is found.
160 func (p *parser) clearStackToContext(s scope) {
161         for i := len(p.oe) - 1; i >= 0; i-- {
162                 tagAtom := p.oe[i].DataAtom
163                 switch s {
164                 case tableScope:
165                         if tagAtom == a.Html || tagAtom == a.Table {
166                                 p.oe = p.oe[:i+1]
167                                 return
168                         }
169                 case tableRowScope:
170                         if tagAtom == a.Html || tagAtom == a.Tr {
171                                 p.oe = p.oe[:i+1]
172                                 return
173                         }
174                 case tableBodyScope:
175                         if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead {
176                                 p.oe = p.oe[:i+1]
177                                 return
178                         }
179                 default:
180                         panic("unreachable")
181                 }
182         }
183 }
184
185 // generateImpliedEndTags pops nodes off the stack of open elements as long as
186 // the top node has a tag name of dd, dt, li, option, optgroup, p, rp, or rt.
187 // If exceptions are specified, nodes with that name will not be popped off.
188 func (p *parser) generateImpliedEndTags(exceptions ...string) {
189         var i int
190 loop:
191         for i = len(p.oe) - 1; i >= 0; i-- {
192                 n := p.oe[i]
193                 if n.Type == ElementNode {
194                         switch n.DataAtom {
195                         case a.Dd, a.Dt, a.Li, a.Option, a.Optgroup, a.P, a.Rp, a.Rt:
196                                 for _, except := range exceptions {
197                                         if n.Data == except {
198                                                 break loop
199                                         }
200                                 }
201                                 continue
202                         }
203                 }
204                 break
205         }
206
207         p.oe = p.oe[:i+1]
208 }
209
210 // addChild adds a child node n to the top element, and pushes n onto the stack
211 // of open elements if it is an element node.
212 func (p *parser) addChild(n *Node) {
213         if p.shouldFosterParent() {
214                 p.fosterParent(n)
215         } else {
216                 p.top().AppendChild(n)
217         }
218
219         if n.Type == ElementNode {
220                 p.oe = append(p.oe, n)
221         }
222 }
223
224 // shouldFosterParent returns whether the next node to be added should be
225 // foster parented.
226 func (p *parser) shouldFosterParent() bool {
227         if p.fosterParenting {
228                 switch p.top().DataAtom {
229                 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
230                         return true
231                 }
232         }
233         return false
234 }
235
236 // fosterParent adds a child node according to the foster parenting rules.
237 // Section 12.2.5.3, "foster parenting".
238 func (p *parser) fosterParent(n *Node) {
239         var table, parent, prev *Node
240         var i int
241         for i = len(p.oe) - 1; i >= 0; i-- {
242                 if p.oe[i].DataAtom == a.Table {
243                         table = p.oe[i]
244                         break
245                 }
246         }
247
248         if table == nil {
249                 // The foster parent is the html element.
250                 parent = p.oe[0]
251         } else {
252                 parent = table.Parent
253         }
254         if parent == nil {
255                 parent = p.oe[i-1]
256         }
257
258         if table != nil {
259                 prev = table.PrevSibling
260         } else {
261                 prev = parent.LastChild
262         }
263         if prev != nil && prev.Type == TextNode && n.Type == TextNode {
264                 prev.Data += n.Data
265                 return
266         }
267
268         parent.InsertBefore(n, table)
269 }
270
271 // addText adds text to the preceding node if it is a text node, or else it
272 // calls addChild with a new text node.
273 func (p *parser) addText(text string) {
274         if text == "" {
275                 return
276         }
277
278         if p.shouldFosterParent() {
279                 p.fosterParent(&Node{
280                         Type: TextNode,
281                         Data: text,
282                 })
283                 return
284         }
285
286         t := p.top()
287         if n := t.LastChild; n != nil && n.Type == TextNode {
288                 n.Data += text
289                 return
290         }
291         p.addChild(&Node{
292                 Type: TextNode,
293                 Data: text,
294         })
295 }
296
297 // addElement adds a child element based on the current token.
298 func (p *parser) addElement() {
299         p.addChild(&Node{
300                 Type:     ElementNode,
301                 DataAtom: p.tok.DataAtom,
302                 Data:     p.tok.Data,
303                 Attr:     p.tok.Attr,
304         })
305 }
306
307 // Section 12.2.3.3.
308 func (p *parser) addFormattingElement() {
309         tagAtom, attr := p.tok.DataAtom, p.tok.Attr
310         p.addElement()
311
312         // Implement the Noah's Ark clause, but with three per family instead of two.
313         identicalElements := 0
314 findIdenticalElements:
315         for i := len(p.afe) - 1; i >= 0; i-- {
316                 n := p.afe[i]
317                 if n.Type == scopeMarkerNode {
318                         break
319                 }
320                 if n.Type != ElementNode {
321                         continue
322                 }
323                 if n.Namespace != "" {
324                         continue
325                 }
326                 if n.DataAtom != tagAtom {
327                         continue
328                 }
329                 if len(n.Attr) != len(attr) {
330                         continue
331                 }
332         compareAttributes:
333                 for _, t0 := range n.Attr {
334                         for _, t1 := range attr {
335                                 if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
336                                         // Found a match for this attribute, continue with the next attribute.
337                                         continue compareAttributes
338                                 }
339                         }
340                         // If we get here, there is no attribute that matches a.
341                         // Therefore the element is not identical to the new one.
342                         continue findIdenticalElements
343                 }
344
345                 identicalElements++
346                 if identicalElements >= 3 {
347                         p.afe.remove(n)
348                 }
349         }
350
351         p.afe = append(p.afe, p.top())
352 }
353
354 // Section 12.2.3.3.
355 func (p *parser) clearActiveFormattingElements() {
356         for {
357                 n := p.afe.pop()
358                 if len(p.afe) == 0 || n.Type == scopeMarkerNode {
359                         return
360                 }
361         }
362 }
363
364 // Section 12.2.3.3.
365 func (p *parser) reconstructActiveFormattingElements() {
366         n := p.afe.top()
367         if n == nil {
368                 return
369         }
370         if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
371                 return
372         }
373         i := len(p.afe) - 1
374         for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
375                 if i == 0 {
376                         i = -1
377                         break
378                 }
379                 i--
380                 n = p.afe[i]
381         }
382         for {
383                 i++
384                 clone := p.afe[i].clone()
385                 p.addChild(clone)
386                 p.afe[i] = clone
387                 if i == len(p.afe)-1 {
388                         break
389                 }
390         }
391 }
392
393 // Section 12.2.4.
394 func (p *parser) acknowledgeSelfClosingTag() {
395         p.hasSelfClosingToken = false
396 }
397
398 // An insertion mode (section 12.2.3.1) is the state transition function from
399 // a particular state in the HTML5 parser's state machine. It updates the
400 // parser's fields depending on parser.tok (where ErrorToken means EOF).
401 // It returns whether the token was consumed.
402 type insertionMode func(*parser) bool
403
404 // setOriginalIM sets the insertion mode to return to after completing a text or
405 // inTableText insertion mode.
406 // Section 12.2.3.1, "using the rules for".
407 func (p *parser) setOriginalIM() {
408         if p.originalIM != nil {
409                 panic("html: bad parser state: originalIM was set twice")
410         }
411         p.originalIM = p.im
412 }
413
414 // Section 12.2.3.1, "reset the insertion mode".
415 func (p *parser) resetInsertionMode() {
416         for i := len(p.oe) - 1; i >= 0; i-- {
417                 n := p.oe[i]
418                 if i == 0 && p.context != nil {
419                         n = p.context
420                 }
421
422                 switch n.DataAtom {
423                 case a.Select:
424                         p.im = inSelectIM
425                 case a.Td, a.Th:
426                         p.im = inCellIM
427                 case a.Tr:
428                         p.im = inRowIM
429                 case a.Tbody, a.Thead, a.Tfoot:
430                         p.im = inTableBodyIM
431                 case a.Caption:
432                         p.im = inCaptionIM
433                 case a.Colgroup:
434                         p.im = inColumnGroupIM
435                 case a.Table:
436                         p.im = inTableIM
437                 case a.Head:
438                         p.im = inBodyIM
439                 case a.Body:
440                         p.im = inBodyIM
441                 case a.Frameset:
442                         p.im = inFramesetIM
443                 case a.Html:
444                         p.im = beforeHeadIM
445                 default:
446                         continue
447                 }
448                 return
449         }
450         p.im = inBodyIM
451 }
452
// whitespace is the cutset of characters (space, tab, carriage return, line
// feed and form feed) that the insertion modes trim from the start of text
// tokens.
const whitespace = " \t\r\n\f"
454
455 // Section 12.2.5.4.1.
456 func initialIM(p *parser) bool {
457         switch p.tok.Type {
458         case TextToken:
459                 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
460                 if len(p.tok.Data) == 0 {
461                         // It was all whitespace, so ignore it.
462                         return true
463                 }
464         case CommentToken:
465                 p.doc.AppendChild(&Node{
466                         Type: CommentNode,
467                         Data: p.tok.Data,
468                 })
469                 return true
470         case DoctypeToken:
471                 n, quirks := parseDoctype(p.tok.Data)
472                 p.doc.AppendChild(n)
473                 p.quirks = quirks
474                 p.im = beforeHTMLIM
475                 return true
476         }
477         p.quirks = true
478         p.im = beforeHTMLIM
479         return false
480 }
481
482 // Section 12.2.5.4.2.
483 func beforeHTMLIM(p *parser) bool {
484         switch p.tok.Type {
485         case DoctypeToken:
486                 // Ignore the token.
487                 return true
488         case TextToken:
489                 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
490                 if len(p.tok.Data) == 0 {
491                         // It was all whitespace, so ignore it.
492                         return true
493                 }
494         case StartTagToken:
495                 if p.tok.DataAtom == a.Html {
496                         p.addElement()
497                         p.im = beforeHeadIM
498                         return true
499                 }
500         case EndTagToken:
501                 switch p.tok.DataAtom {
502                 case a.Head, a.Body, a.Html, a.Br:
503                         p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
504                         return false
505                 default:
506                         // Ignore the token.
507                         return true
508                 }
509         case CommentToken:
510                 p.doc.AppendChild(&Node{
511                         Type: CommentNode,
512                         Data: p.tok.Data,
513                 })
514                 return true
515         }
516         p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
517         return false
518 }
519
520 // Section 12.2.5.4.3.
521 func beforeHeadIM(p *parser) bool {
522         switch p.tok.Type {
523         case TextToken:
524                 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
525                 if len(p.tok.Data) == 0 {
526                         // It was all whitespace, so ignore it.
527                         return true
528                 }
529         case StartTagToken:
530                 switch p.tok.DataAtom {
531                 case a.Head:
532                         p.addElement()
533                         p.head = p.top()
534                         p.im = inHeadIM
535                         return true
536                 case a.Html:
537                         return inBodyIM(p)
538                 }
539         case EndTagToken:
540                 switch p.tok.DataAtom {
541                 case a.Head, a.Body, a.Html, a.Br:
542                         p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
543                         return false
544                 default:
545                         // Ignore the token.
546                         return true
547                 }
548         case CommentToken:
549                 p.addChild(&Node{
550                         Type: CommentNode,
551                         Data: p.tok.Data,
552                 })
553                 return true
554         case DoctypeToken:
555                 // Ignore the token.
556                 return true
557         }
558
559         p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
560         return false
561 }
562
563 // Section 12.2.5.4.4.
564 func inHeadIM(p *parser) bool {
565         switch p.tok.Type {
566         case TextToken:
567                 s := strings.TrimLeft(p.tok.Data, whitespace)
568                 if len(s) < len(p.tok.Data) {
569                         // Add the initial whitespace to the current node.
570                         p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
571                         if s == "" {
572                                 return true
573                         }
574                         p.tok.Data = s
575                 }
576         case StartTagToken:
577                 switch p.tok.DataAtom {
578                 case a.Html:
579                         return inBodyIM(p)
580                 case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta:
581                         p.addElement()
582                         p.oe.pop()
583                         p.acknowledgeSelfClosingTag()
584                         return true
585                 case a.Script, a.Title, a.Noscript, a.Noframes, a.Style:
586                         p.addElement()
587                         p.setOriginalIM()
588                         p.im = textIM
589                         return true
590                 case a.Head:
591                         // Ignore the token.
592                         return true
593                 }
594         case EndTagToken:
595                 switch p.tok.DataAtom {
596                 case a.Head:
597                         n := p.oe.pop()
598                         if n.DataAtom != a.Head {
599                                 panic("html: bad parser state: <head> element not found, in the in-head insertion mode")
600                         }
601                         p.im = afterHeadIM
602                         return true
603                 case a.Body, a.Html, a.Br:
604                         p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
605                         return false
606                 default:
607                         // Ignore the token.
608                         return true
609                 }
610         case CommentToken:
611                 p.addChild(&Node{
612                         Type: CommentNode,
613                         Data: p.tok.Data,
614                 })
615                 return true
616         case DoctypeToken:
617                 // Ignore the token.
618                 return true
619         }
620
621         p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
622         return false
623 }
624
625 // Section 12.2.5.4.6.
626 func afterHeadIM(p *parser) bool {
627         switch p.tok.Type {
628         case TextToken:
629                 s := strings.TrimLeft(p.tok.Data, whitespace)
630                 if len(s) < len(p.tok.Data) {
631                         // Add the initial whitespace to the current node.
632                         p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
633                         if s == "" {
634                                 return true
635                         }
636                         p.tok.Data = s
637                 }
638         case StartTagToken:
639                 switch p.tok.DataAtom {
640                 case a.Html:
641                         return inBodyIM(p)
642                 case a.Body:
643                         p.addElement()
644                         p.framesetOK = false
645                         p.im = inBodyIM
646                         return true
647                 case a.Frameset:
648                         p.addElement()
649                         p.im = inFramesetIM
650                         return true
651                 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title:
652                         p.oe = append(p.oe, p.head)
653                         defer p.oe.remove(p.head)
654                         return inHeadIM(p)
655                 case a.Head:
656                         // Ignore the token.
657                         return true
658                 }
659         case EndTagToken:
660                 switch p.tok.DataAtom {
661                 case a.Body, a.Html, a.Br:
662                         // Drop down to creating an implied <body> tag.
663                 default:
664                         // Ignore the token.
665                         return true
666                 }
667         case CommentToken:
668                 p.addChild(&Node{
669                         Type: CommentNode,
670                         Data: p.tok.Data,
671                 })
672                 return true
673         case DoctypeToken:
674                 // Ignore the token.
675                 return true
676         }
677
678         p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
679         p.framesetOK = true
680         return false
681 }
682
683 // copyAttributes copies attributes of src not found on dst to dst.
684 func copyAttributes(dst *Node, src Token) {
685         if len(src.Attr) == 0 {
686                 return
687         }
688         attr := map[string]string{}
689         for _, t := range dst.Attr {
690                 attr[t.Key] = t.Val
691         }
692         for _, t := range src.Attr {
693                 if _, ok := attr[t.Key]; !ok {
694                         dst.Attr = append(dst.Attr, t)
695                         attr[t.Key] = t.Val
696                 }
697         }
698 }
699
700 // Section 12.2.5.4.7.
701 func inBodyIM(p *parser) bool {
702         switch p.tok.Type {
703         case TextToken:
704                 d := p.tok.Data
705                 switch n := p.oe.top(); n.DataAtom {
706                 case a.Pre, a.Listing:
707                         if n.FirstChild == nil {
708                                 // Ignore a newline at the start of a <pre> block.
709                                 if d != "" && d[0] == '\r' {
710                                         d = d[1:]
711                                 }
712                                 if d != "" && d[0] == '\n' {
713                                         d = d[1:]
714                                 }
715                         }
716                 }
717                 d = strings.Replace(d, "\x00", "", -1)
718                 if d == "" {
719                         return true
720                 }
721                 p.reconstructActiveFormattingElements()
722                 p.addText(d)
723                 if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
724                         // There were non-whitespace characters inserted.
725                         p.framesetOK = false
726                 }
727         case StartTagToken:
728                 switch p.tok.DataAtom {
729                 case a.Html:
730                         copyAttributes(p.oe[0], p.tok)
731                 case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title:
732                         return inHeadIM(p)
733                 case a.Body:
734                         if len(p.oe) >= 2 {
735                                 body := p.oe[1]
736                                 if body.Type == ElementNode && body.DataAtom == a.Body {
737                                         p.framesetOK = false
738                                         copyAttributes(body, p.tok)
739                                 }
740                         }
741                 case a.Frameset:
742                         if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
743                                 // Ignore the token.
744                                 return true
745                         }
746                         body := p.oe[1]
747                         if body.Parent != nil {
748                                 body.Parent.RemoveChild(body)
749                         }
750                         p.oe = p.oe[:1]
751                         p.addElement()
752                         p.im = inFramesetIM
753                         return true
754                 case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul:
755                         p.popUntil(buttonScope, a.P)
756                         p.addElement()
757                 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
758                         p.popUntil(buttonScope, a.P)
759                         switch n := p.top(); n.DataAtom {
760                         case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
761                                 p.oe.pop()
762                         }
763                         p.addElement()
764                 case a.Pre, a.Listing:
765                         p.popUntil(buttonScope, a.P)
766                         p.addElement()
767                         // The newline, if any, will be dealt with by the TextToken case.
768                         p.framesetOK = false
769                 case a.Form:
770                         if p.form == nil {
771                                 p.popUntil(buttonScope, a.P)
772                                 p.addElement()
773                                 p.form = p.top()
774                         }
775                 case a.Li:
776                         p.framesetOK = false
777                         for i := len(p.oe) - 1; i >= 0; i-- {
778                                 node := p.oe[i]
779                                 switch node.DataAtom {
780                                 case a.Li:
781                                         p.oe = p.oe[:i]
782                                 case a.Address, a.Div, a.P:
783                                         continue
784                                 default:
785                                         if !isSpecialElement(node) {
786                                                 continue
787                                         }
788                                 }
789                                 break
790                         }
791                         p.popUntil(buttonScope, a.P)
792                         p.addElement()
793                 case a.Dd, a.Dt:
794                         p.framesetOK = false
795                         for i := len(p.oe) - 1; i >= 0; i-- {
796                                 node := p.oe[i]
797                                 switch node.DataAtom {
798                                 case a.Dd, a.Dt:
799                                         p.oe = p.oe[:i]
800                                 case a.Address, a.Div, a.P:
801                                         continue
802                                 default:
803                                         if !isSpecialElement(node) {
804                                                 continue
805                                         }
806                                 }
807                                 break
808                         }
809                         p.popUntil(buttonScope, a.P)
810                         p.addElement()
811                 case a.Plaintext:
812                         p.popUntil(buttonScope, a.P)
813                         p.addElement()
814                 case a.Button:
815                         p.popUntil(defaultScope, a.Button)
816                         p.reconstructActiveFormattingElements()
817                         p.addElement()
818                         p.framesetOK = false
819                 case a.A:
820                         for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
821                                 if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
822                                         p.inBodyEndTagFormatting(a.A)
823                                         p.oe.remove(n)
824                                         p.afe.remove(n)
825                                         break
826                                 }
827                         }
828                         p.reconstructActiveFormattingElements()
829                         p.addFormattingElement()
830                 case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
831                         p.reconstructActiveFormattingElements()
832                         p.addFormattingElement()
833                 case a.Nobr:
834                         p.reconstructActiveFormattingElements()
835                         if p.elementInScope(defaultScope, a.Nobr) {
836                                 p.inBodyEndTagFormatting(a.Nobr)
837                                 p.reconstructActiveFormattingElements()
838                         }
839                         p.addFormattingElement()
840                 case a.Applet, a.Marquee, a.Object:
841                         p.reconstructActiveFormattingElements()
842                         p.addElement()
843                         p.afe = append(p.afe, &scopeMarker)
844                         p.framesetOK = false
845                 case a.Table:
846                         if !p.quirks {
847                                 p.popUntil(buttonScope, a.P)
848                         }
849                         p.addElement()
850                         p.framesetOK = false
851                         p.im = inTableIM
852                         return true
853                 case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
854                         p.reconstructActiveFormattingElements()
855                         p.addElement()
856                         p.oe.pop()
857                         p.acknowledgeSelfClosingTag()
858                         if p.tok.DataAtom == a.Input {
859                                 for _, t := range p.tok.Attr {
860                                         if t.Key == "type" {
861                                                 if strings.ToLower(t.Val) == "hidden" {
862                                                         // Skip setting framesetOK = false
863                                                         return true
864                                                 }
865                                         }
866                                 }
867                         }
868                         p.framesetOK = false
869                 case a.Param, a.Source, a.Track:
870                         p.addElement()
871                         p.oe.pop()
872                         p.acknowledgeSelfClosingTag()
873                 case a.Hr:
874                         p.popUntil(buttonScope, a.P)
875                         p.addElement()
876                         p.oe.pop()
877                         p.acknowledgeSelfClosingTag()
878                         p.framesetOK = false
879                 case a.Image:
880                         p.tok.DataAtom = a.Img
881                         p.tok.Data = a.Img.String()
882                         return false
883                 case a.Isindex:
884                         if p.form != nil {
885                                 // Ignore the token.
886                                 return true
887                         }
888                         action := ""
889                         prompt := "This is a searchable index. Enter search keywords: "
890                         attr := []Attribute{{Key: "name", Val: "isindex"}}
891                         for _, t := range p.tok.Attr {
892                                 switch t.Key {
893                                 case "action":
894                                         action = t.Val
895                                 case "name":
896                                         // Ignore the attribute.
897                                 case "prompt":
898                                         prompt = t.Val
899                                 default:
900                                         attr = append(attr, t)
901                                 }
902                         }
903                         p.acknowledgeSelfClosingTag()
904                         p.popUntil(buttonScope, a.P)
905                         p.parseImpliedToken(StartTagToken, a.Form, a.Form.String())
906                         if action != "" {
907                                 p.form.Attr = []Attribute{{Key: "action", Val: action}}
908                         }
909                         p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
910                         p.parseImpliedToken(StartTagToken, a.Label, a.Label.String())
911                         p.addText(prompt)
912                         p.addChild(&Node{
913                                 Type:     ElementNode,
914                                 DataAtom: a.Input,
915                                 Data:     a.Input.String(),
916                                 Attr:     attr,
917                         })
918                         p.oe.pop()
919                         p.parseImpliedToken(EndTagToken, a.Label, a.Label.String())
920                         p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
921                         p.parseImpliedToken(EndTagToken, a.Form, a.Form.String())
922                 case a.Textarea:
923                         p.addElement()
924                         p.setOriginalIM()
925                         p.framesetOK = false
926                         p.im = textIM
927                 case a.Xmp:
928                         p.popUntil(buttonScope, a.P)
929                         p.reconstructActiveFormattingElements()
930                         p.framesetOK = false
931                         p.addElement()
932                         p.setOriginalIM()
933                         p.im = textIM
934                 case a.Iframe:
935                         p.framesetOK = false
936                         p.addElement()
937                         p.setOriginalIM()
938                         p.im = textIM
939                 case a.Noembed, a.Noscript:
940                         p.addElement()
941                         p.setOriginalIM()
942                         p.im = textIM
943                 case a.Select:
944                         p.reconstructActiveFormattingElements()
945                         p.addElement()
946                         p.framesetOK = false
947                         p.im = inSelectIM
948                         return true
949                 case a.Optgroup, a.Option:
950                         if p.top().DataAtom == a.Option {
951                                 p.oe.pop()
952                         }
953                         p.reconstructActiveFormattingElements()
954                         p.addElement()
955                 case a.Rp, a.Rt:
956                         if p.elementInScope(defaultScope, a.Ruby) {
957                                 p.generateImpliedEndTags()
958                         }
959                         p.addElement()
960                 case a.Math, a.Svg:
961                         p.reconstructActiveFormattingElements()
962                         if p.tok.DataAtom == a.Math {
963                                 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
964                         } else {
965                                 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
966                         }
967                         adjustForeignAttributes(p.tok.Attr)
968                         p.addElement()
969                         p.top().Namespace = p.tok.Data
970                         if p.hasSelfClosingToken {
971                                 p.oe.pop()
972                                 p.acknowledgeSelfClosingTag()
973                         }
974                         return true
975                 case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
976                         // Ignore the token.
977                 default:
978                         p.reconstructActiveFormattingElements()
979                         p.addElement()
980                 }
981         case EndTagToken:
982                 switch p.tok.DataAtom {
983                 case a.Body:
984                         if p.elementInScope(defaultScope, a.Body) {
985                                 p.im = afterBodyIM
986                         }
987                 case a.Html:
988                         if p.elementInScope(defaultScope, a.Body) {
989                                 p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
990                                 return false
991                         }
992                         return true
993                 case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
994                         p.popUntil(defaultScope, p.tok.DataAtom)
995                 case a.Form:
996                         node := p.form
997                         p.form = nil
998                         i := p.indexOfElementInScope(defaultScope, a.Form)
999                         if node == nil || i == -1 || p.oe[i] != node {
1000                                 // Ignore the token.
1001                                 return true
1002                         }
1003                         p.generateImpliedEndTags()
1004                         p.oe.remove(node)
1005                 case a.P:
1006                         if !p.elementInScope(buttonScope, a.P) {
1007                                 p.parseImpliedToken(StartTagToken, a.P, a.P.String())
1008                         }
1009                         p.popUntil(buttonScope, a.P)
1010                 case a.Li:
1011                         p.popUntil(listItemScope, a.Li)
1012                 case a.Dd, a.Dt:
1013                         p.popUntil(defaultScope, p.tok.DataAtom)
1014                 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
1015                         p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
1016                 case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
1017                         p.inBodyEndTagFormatting(p.tok.DataAtom)
1018                 case a.Applet, a.Marquee, a.Object:
1019                         if p.popUntil(defaultScope, p.tok.DataAtom) {
1020                                 p.clearActiveFormattingElements()
1021                         }
1022                 case a.Br:
1023                         p.tok.Type = StartTagToken
1024                         return false
1025                 default:
1026                         p.inBodyEndTagOther(p.tok.DataAtom)
1027                 }
1028         case CommentToken:
1029                 p.addChild(&Node{
1030                         Type: CommentNode,
1031                         Data: p.tok.Data,
1032                 })
1033         }
1034
1035         return true
1036 }
1037
// inBodyEndTagFormatting handles an end tag (or an implied close) for the
// formatting element identified by tagAtom. It rewrites the stack of open
// elements (p.oe), the list of active formatting elements (p.afe) and the
// node tree so that mis-nested formatting elements are closed and re-opened
// around the intervening block content.
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
        // This is the "adoption agency" algorithm, described at
        // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency

        // TODO: this is a fairly literal line-by-line translation of that algorithm.
        // Once the code successfully parses the comprehensive test suite, we should
        // refactor this code to be more idiomatic.

        // Steps 1-4. The outer loop.
        // The body runs at most eight times; every early exit below returns from
        // the function instead of merely leaving the loop.
        for i := 0; i < 8; i++ {
                // Step 5. Find the formatting element.
                // Search p.afe from the most recent entry backwards, stopping at the
                // first scope marker, for an entry whose atom equals tagAtom.
                var formattingElement *Node
                for j := len(p.afe) - 1; j >= 0; j-- {
                        if p.afe[j].Type == scopeMarkerNode {
                                break
                        }
                        if p.afe[j].DataAtom == tagAtom {
                                formattingElement = p.afe[j]
                                break
                        }
                }
                if formattingElement == nil {
                        // No matching formatting element: treat the tag like any other
                        // end tag.
                        p.inBodyEndTagOther(tagAtom)
                        return
                }
                feIndex := p.oe.index(formattingElement)
                if feIndex == -1 {
                        // The element is listed as active but is no longer open; just
                        // drop the stale entry.
                        p.afe.remove(formattingElement)
                        return
                }
                if !p.elementInScope(defaultScope, tagAtom) {
                        // Ignore the tag.
                        return
                }

                // Steps 9-10. Find the furthest block.
                // This is the first element at or above feIndex on the stack for which
                // isSpecialElement reports true.
                var furthestBlock *Node
                for _, e := range p.oe[feIndex:] {
                        if isSpecialElement(e) {
                                furthestBlock = e
                                break
                        }
                }
                if furthestBlock == nil {
                        // Nothing special is nested inside the formatting element: pop the
                        // stack up to and including it, and deactivate it.
                        e := p.oe.pop()
                        for e != formattingElement {
                                e = p.oe.pop()
                        }
                        p.afe.remove(e)
                        return
                }

                // Steps 11-12. Find the common ancestor and bookmark node.
                // commonAncestor is the element immediately below the formatting
                // element on the stack; bookmark is a position in p.afe at which the
                // replacement formatting element is inserted in step 18.
                commonAncestor := p.oe[feIndex-1]
                bookmark := p.afe.index(formattingElement)

                // Step 13. The inner loop. Find the lastNode to reparent.
                lastNode := furthestBlock
                node := furthestBlock
                x := p.oe.index(node)
                // Steps 13.1-13.2
                // NOTE(review): the loop ends after three iterations even when
                // formattingElement has not been reached yet; confirm this bound
                // against the current text of the specification.
                for j := 0; j < 3; j++ {
                        // Step 13.3.
                        // Move one entry down the stack of open elements.
                        x--
                        node = p.oe[x]
                        // Step 13.4 - 13.5.
                        // Elements that are not active formatting elements are dropped from
                        // the stack. The entries above shift down by one, so the next x--
                        // still lands on the element that was below node.
                        if p.afe.index(node) == -1 {
                                p.oe.remove(node)
                                continue
                        }
                        // Step 13.6.
                        if node == formattingElement {
                                break
                        }
                        // Step 13.7.
                        // Substitute a clone for node in both lists.
                        clone := node.clone()
                        p.afe[p.afe.index(node)] = clone
                        p.oe[p.oe.index(node)] = clone
                        node = clone
                        // Step 13.8.
                        // Only on the first reparenting (lastNode is still furthestBlock)
                        // is the bookmark moved to just after the clone.
                        if lastNode == furthestBlock {
                                bookmark = p.afe.index(node) + 1
                        }
                        // Step 13.9.
                        // Detach lastNode from its current parent, if any, and hang it
                        // under the clone.
                        if lastNode.Parent != nil {
                                lastNode.Parent.RemoveChild(lastNode)
                        }
                        node.AppendChild(lastNode)
                        // Step 13.10.
                        lastNode = node
                }

                // Step 14. Reparent lastNode to the common ancestor,
                // or for misnested table nodes, to the foster parent.
                if lastNode.Parent != nil {
                        lastNode.Parent.RemoveChild(lastNode)
                }
                switch commonAncestor.DataAtom {
                case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
                        p.fosterParent(lastNode)
                default:
                        commonAncestor.AppendChild(lastNode)
                }

                // Steps 15-17. Reparent nodes from the furthest block's children
                // to a clone of the formatting element.
                clone := formattingElement.clone()
                reparentChildren(clone, furthestBlock)
                furthestBlock.AppendChild(clone)

                // Step 18. Fix up the list of active formatting elements.
                // Removing formattingElement shifts every later entry down by one, so
                // a bookmark that lies after it has to shift as well.
                if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
                        // Move the bookmark with the rest of the list.
                        bookmark--
                }
                p.afe.remove(formattingElement)
                p.afe.insert(bookmark, clone)

                // Step 19. Fix up the stack of open elements.
                // The clone takes the slot directly above furthestBlock.
                p.oe.remove(formattingElement)
                p.oe.insert(p.oe.index(furthestBlock)+1, clone)
        }
}
1161
1162 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
1163 // "Any other end tag" handling from 12.2.5.5 The rules for parsing tokens in foreign content
1164 // https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
1165 func (p *parser) inBodyEndTagOther(tagAtom a.Atom) {
1166         for i := len(p.oe) - 1; i >= 0; i-- {
1167                 if p.oe[i].DataAtom == tagAtom {
1168                         p.oe = p.oe[:i]
1169                         break
1170                 }
1171                 if isSpecialElement(p.oe[i]) {
1172                         break
1173                 }
1174         }
1175 }
1176
1177 // Section 12.2.5.4.8.
1178 func textIM(p *parser) bool {
1179         switch p.tok.Type {
1180         case ErrorToken:
1181                 p.oe.pop()
1182         case TextToken:
1183                 d := p.tok.Data
1184                 if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
1185                         // Ignore a newline at the start of a <textarea> block.
1186                         if d != "" && d[0] == '\r' {
1187                                 d = d[1:]
1188                         }
1189                         if d != "" && d[0] == '\n' {
1190                                 d = d[1:]
1191                         }
1192                 }
1193                 if d == "" {
1194                         return true
1195                 }
1196                 p.addText(d)
1197                 return true
1198         case EndTagToken:
1199                 p.oe.pop()
1200         }
1201         p.im = p.originalIM
1202         p.originalIM = nil
1203         return p.tok.Type == EndTagToken
1204 }
1205
1206 // Section 12.2.5.4.9.
1207 func inTableIM(p *parser) bool {
1208         switch p.tok.Type {
1209         case ErrorToken:
1210                 // Stop parsing.
1211                 return true
1212         case TextToken:
1213                 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)
1214                 switch p.oe.top().DataAtom {
1215                 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1216                         if strings.Trim(p.tok.Data, whitespace) == "" {
1217                                 p.addText(p.tok.Data)
1218                                 return true
1219                         }
1220                 }
1221         case StartTagToken:
1222                 switch p.tok.DataAtom {
1223                 case a.Caption:
1224                         p.clearStackToContext(tableScope)
1225                         p.afe = append(p.afe, &scopeMarker)
1226                         p.addElement()
1227                         p.im = inCaptionIM
1228                         return true
1229                 case a.Colgroup:
1230                         p.clearStackToContext(tableScope)
1231                         p.addElement()
1232                         p.im = inColumnGroupIM
1233                         return true
1234                 case a.Col:
1235                         p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())
1236                         return false
1237                 case a.Tbody, a.Tfoot, a.Thead:
1238                         p.clearStackToContext(tableScope)
1239                         p.addElement()
1240                         p.im = inTableBodyIM
1241                         return true
1242                 case a.Td, a.Th, a.Tr:
1243                         p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())
1244                         return false
1245                 case a.Table:
1246                         if p.popUntil(tableScope, a.Table) {
1247                                 p.resetInsertionMode()
1248                                 return false
1249                         }
1250                         // Ignore the token.
1251                         return true
1252                 case a.Style, a.Script:
1253                         return inHeadIM(p)
1254                 case a.Input:
1255                         for _, t := range p.tok.Attr {
1256                                 if t.Key == "type" && strings.ToLower(t.Val) == "hidden" {
1257                                         p.addElement()
1258                                         p.oe.pop()
1259                                         return true
1260                                 }
1261                         }
1262                         // Otherwise drop down to the default action.
1263                 case a.Form:
1264                         if p.form != nil {
1265                                 // Ignore the token.
1266                                 return true
1267                         }
1268                         p.addElement()
1269                         p.form = p.oe.pop()
1270                 case a.Select:
1271                         p.reconstructActiveFormattingElements()
1272                         switch p.top().DataAtom {
1273                         case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1274                                 p.fosterParenting = true
1275                         }
1276                         p.addElement()
1277                         p.fosterParenting = false
1278                         p.framesetOK = false
1279                         p.im = inSelectInTableIM
1280                         return true
1281                 }
1282         case EndTagToken:
1283                 switch p.tok.DataAtom {
1284                 case a.Table:
1285                         if p.popUntil(tableScope, a.Table) {
1286                                 p.resetInsertionMode()
1287                                 return true
1288                         }
1289                         // Ignore the token.
1290                         return true
1291                 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1292                         // Ignore the token.
1293                         return true
1294                 }
1295         case CommentToken:
1296                 p.addChild(&Node{
1297                         Type: CommentNode,
1298                         Data: p.tok.Data,
1299                 })
1300                 return true
1301         case DoctypeToken:
1302                 // Ignore the token.
1303                 return true
1304         }
1305
1306         p.fosterParenting = true
1307         defer func() { p.fosterParenting = false }()
1308
1309         return inBodyIM(p)
1310 }
1311
1312 // Section 12.2.5.4.11.
1313 func inCaptionIM(p *parser) bool {
1314         switch p.tok.Type {
1315         case StartTagToken:
1316                 switch p.tok.DataAtom {
1317                 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:
1318                         if p.popUntil(tableScope, a.Caption) {
1319                                 p.clearActiveFormattingElements()
1320                                 p.im = inTableIM
1321                                 return false
1322                         } else {
1323                                 // Ignore the token.
1324                                 return true
1325                         }
1326                 case a.Select:
1327                         p.reconstructActiveFormattingElements()
1328                         p.addElement()
1329                         p.framesetOK = false
1330                         p.im = inSelectInTableIM
1331                         return true
1332                 }
1333         case EndTagToken:
1334                 switch p.tok.DataAtom {
1335                 case a.Caption:
1336                         if p.popUntil(tableScope, a.Caption) {
1337                                 p.clearActiveFormattingElements()
1338                                 p.im = inTableIM
1339                         }
1340                         return true
1341                 case a.Table:
1342                         if p.popUntil(tableScope, a.Caption) {
1343                                 p.clearActiveFormattingElements()
1344                                 p.im = inTableIM
1345                                 return false
1346                         } else {
1347                                 // Ignore the token.
1348                                 return true
1349                         }
1350                 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1351                         // Ignore the token.
1352                         return true
1353                 }
1354         }
1355         return inBodyIM(p)
1356 }
1357
1358 // Section 12.2.5.4.12.
1359 func inColumnGroupIM(p *parser) bool {
1360         switch p.tok.Type {
1361         case TextToken:
1362                 s := strings.TrimLeft(p.tok.Data, whitespace)
1363                 if len(s) < len(p.tok.Data) {
1364                         // Add the initial whitespace to the current node.
1365                         p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
1366                         if s == "" {
1367                                 return true
1368                         }
1369                         p.tok.Data = s
1370                 }
1371         case CommentToken:
1372                 p.addChild(&Node{
1373                         Type: CommentNode,
1374                         Data: p.tok.Data,
1375                 })
1376                 return true
1377         case DoctypeToken:
1378                 // Ignore the token.
1379                 return true
1380         case StartTagToken:
1381                 switch p.tok.DataAtom {
1382                 case a.Html:
1383                         return inBodyIM(p)
1384                 case a.Col:
1385                         p.addElement()
1386                         p.oe.pop()
1387                         p.acknowledgeSelfClosingTag()
1388                         return true
1389                 }
1390         case EndTagToken:
1391                 switch p.tok.DataAtom {
1392                 case a.Colgroup:
1393                         if p.oe.top().DataAtom != a.Html {
1394                                 p.oe.pop()
1395                                 p.im = inTableIM
1396                         }
1397                         return true
1398                 case a.Col:
1399                         // Ignore the token.
1400                         return true
1401                 }
1402         }
1403         if p.oe.top().DataAtom != a.Html {
1404                 p.oe.pop()
1405                 p.im = inTableIM
1406                 return false
1407         }
1408         return true
1409 }
1410
1411 // Section 12.2.5.4.13.
1412 func inTableBodyIM(p *parser) bool {
1413         switch p.tok.Type {
1414         case StartTagToken:
1415                 switch p.tok.DataAtom {
1416                 case a.Tr:
1417                         p.clearStackToContext(tableBodyScope)
1418                         p.addElement()
1419                         p.im = inRowIM
1420                         return true
1421                 case a.Td, a.Th:
1422                         p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String())
1423                         return false
1424                 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1425                         if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1426                                 p.im = inTableIM
1427                                 return false
1428                         }
1429                         // Ignore the token.
1430                         return true
1431                 }
1432         case EndTagToken:
1433                 switch p.tok.DataAtom {
1434                 case a.Tbody, a.Tfoot, a.Thead:
1435                         if p.elementInScope(tableScope, p.tok.DataAtom) {
1436                                 p.clearStackToContext(tableBodyScope)
1437                                 p.oe.pop()
1438                                 p.im = inTableIM
1439                         }
1440                         return true
1441                 case a.Table:
1442                         if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1443                                 p.im = inTableIM
1444                                 return false
1445                         }
1446                         // Ignore the token.
1447                         return true
1448                 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr:
1449                         // Ignore the token.
1450                         return true
1451                 }
1452         case CommentToken:
1453                 p.addChild(&Node{
1454                         Type: CommentNode,
1455                         Data: p.tok.Data,
1456                 })
1457                 return true
1458         }
1459
1460         return inTableIM(p)
1461 }
1462
1463 // Section 12.2.5.4.14.
1464 func inRowIM(p *parser) bool {
1465         switch p.tok.Type {
1466         case StartTagToken:
1467                 switch p.tok.DataAtom {
1468                 case a.Td, a.Th:
1469                         p.clearStackToContext(tableRowScope)
1470                         p.addElement()
1471                         p.afe = append(p.afe, &scopeMarker)
1472                         p.im = inCellIM
1473                         return true
1474                 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1475                         if p.popUntil(tableScope, a.Tr) {
1476                                 p.im = inTableBodyIM
1477                                 return false
1478                         }
1479                         // Ignore the token.
1480                         return true
1481                 }
1482         case EndTagToken:
1483                 switch p.tok.DataAtom {
1484                 case a.Tr:
1485                         if p.popUntil(tableScope, a.Tr) {
1486                                 p.im = inTableBodyIM
1487                                 return true
1488                         }
1489                         // Ignore the token.
1490                         return true
1491                 case a.Table:
1492                         if p.popUntil(tableScope, a.Tr) {
1493                                 p.im = inTableBodyIM
1494                                 return false
1495                         }
1496                         // Ignore the token.
1497                         return true
1498                 case a.Tbody, a.Tfoot, a.Thead:
1499                         if p.elementInScope(tableScope, p.tok.DataAtom) {
1500                                 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String())
1501                                 return false
1502                         }
1503                         // Ignore the token.
1504                         return true
1505                 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th:
1506                         // Ignore the token.
1507                         return true
1508                 }
1509         }
1510
1511         return inTableIM(p)
1512 }
1513
1514 // Section 12.2.5.4.15.
1515 func inCellIM(p *parser) bool {
1516         switch p.tok.Type {
1517         case StartTagToken:
1518                 switch p.tok.DataAtom {
1519                 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1520                         if p.popUntil(tableScope, a.Td, a.Th) {
1521                                 // Close the cell and reprocess.
1522                                 p.clearActiveFormattingElements()
1523                                 p.im = inRowIM
1524                                 return false
1525                         }
1526                         // Ignore the token.
1527                         return true
1528                 case a.Select:
1529                         p.reconstructActiveFormattingElements()
1530                         p.addElement()
1531                         p.framesetOK = false
1532                         p.im = inSelectInTableIM
1533                         return true
1534                 }
1535         case EndTagToken:
1536                 switch p.tok.DataAtom {
1537                 case a.Td, a.Th:
1538                         if !p.popUntil(tableScope, p.tok.DataAtom) {
1539                                 // Ignore the token.
1540                                 return true
1541                         }
1542                         p.clearActiveFormattingElements()
1543                         p.im = inRowIM
1544                         return true
1545                 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html:
1546                         // Ignore the token.
1547                         return true
1548                 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1549                         if !p.elementInScope(tableScope, p.tok.DataAtom) {
1550                                 // Ignore the token.
1551                                 return true
1552                         }
1553                         // Close the cell and reprocess.
1554                         p.popUntil(tableScope, a.Td, a.Th)
1555                         p.clearActiveFormattingElements()
1556                         p.im = inRowIM
1557                         return false
1558                 }
1559         }
1560         return inBodyIM(p)
1561 }
1562
1563 // Section 12.2.5.4.16.
1564 func inSelectIM(p *parser) bool {
1565         switch p.tok.Type {
1566         case ErrorToken:
1567                 // Stop parsing.
1568                 return true
1569         case TextToken:
1570                 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))
1571         case StartTagToken:
1572                 switch p.tok.DataAtom {
1573                 case a.Html:
1574                         return inBodyIM(p)
1575                 case a.Option:
1576                         if p.top().DataAtom == a.Option {
1577                                 p.oe.pop()
1578                         }
1579                         p.addElement()
1580                 case a.Optgroup:
1581                         if p.top().DataAtom == a.Option {
1582                                 p.oe.pop()
1583                         }
1584                         if p.top().DataAtom == a.Optgroup {
1585                                 p.oe.pop()
1586                         }
1587                         p.addElement()
1588                 case a.Select:
1589                         p.tok.Type = EndTagToken
1590                         return false
1591                 case a.Input, a.Keygen, a.Textarea:
1592                         if p.elementInScope(selectScope, a.Select) {
1593                                 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
1594                                 return false
1595                         }
1596                         // In order to properly ignore <textarea>, we need to change the tokenizer mode.
1597                         p.tokenizer.NextIsNotRawText()
1598                         // Ignore the token.
1599                         return true
1600                 case a.Script:
1601                         return inHeadIM(p)
1602                 }
1603         case EndTagToken:
1604                 switch p.tok.DataAtom {
1605                 case a.Option:
1606                         if p.top().DataAtom == a.Option {
1607                                 p.oe.pop()
1608                         }
1609                 case a.Optgroup:
1610                         i := len(p.oe) - 1
1611                         if p.oe[i].DataAtom == a.Option {
1612                                 i--
1613                         }
1614                         if p.oe[i].DataAtom == a.Optgroup {
1615                                 p.oe = p.oe[:i]
1616                         }
1617                 case a.Select:
1618                         if p.popUntil(selectScope, a.Select) {
1619                                 p.resetInsertionMode()
1620                         }
1621                 }
1622         case CommentToken:
1623                 p.addChild(&Node{
1624                         Type: CommentNode,
1625                         Data: p.tok.Data,
1626                 })
1627         case DoctypeToken:
1628                 // Ignore the token.
1629                 return true
1630         }
1631
1632         return true
1633 }
1634
1635 // Section 12.2.5.4.17.
1636 func inSelectInTableIM(p *parser) bool {
1637         switch p.tok.Type {
1638         case StartTagToken, EndTagToken:
1639                 switch p.tok.DataAtom {
1640                 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:
1641                         if p.tok.Type == StartTagToken || p.elementInScope(tableScope, p.tok.DataAtom) {
1642                                 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
1643                                 return false
1644                         } else {
1645                                 // Ignore the token.
1646                                 return true
1647                         }
1648                 }
1649         }
1650         return inSelectIM(p)
1651 }
1652
1653 // Section 12.2.5.4.18.
1654 func afterBodyIM(p *parser) bool {
1655         switch p.tok.Type {
1656         case ErrorToken:
1657                 // Stop parsing.
1658                 return true
1659         case TextToken:
1660                 s := strings.TrimLeft(p.tok.Data, whitespace)
1661                 if len(s) == 0 {
1662                         // It was all whitespace.
1663                         return inBodyIM(p)
1664                 }
1665         case StartTagToken:
1666                 if p.tok.DataAtom == a.Html {
1667                         return inBodyIM(p)
1668                 }
1669         case EndTagToken:
1670                 if p.tok.DataAtom == a.Html {
1671                         if !p.fragment {
1672                                 p.im = afterAfterBodyIM
1673                         }
1674                         return true
1675                 }
1676         case CommentToken:
1677                 // The comment is attached to the <html> element.
1678                 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html {
1679                         panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
1680                 }
1681                 p.oe[0].AppendChild(&Node{
1682                         Type: CommentNode,
1683                         Data: p.tok.Data,
1684                 })
1685                 return true
1686         }
1687         p.im = inBodyIM
1688         return false
1689 }
1690
1691 // Section 12.2.5.4.19.
1692 func inFramesetIM(p *parser) bool {
1693         switch p.tok.Type {
1694         case CommentToken:
1695                 p.addChild(&Node{
1696                         Type: CommentNode,
1697                         Data: p.tok.Data,
1698                 })
1699         case TextToken:
1700                 // Ignore all text but whitespace.
1701                 s := strings.Map(func(c rune) rune {
1702                         switch c {
1703                         case ' ', '\t', '\n', '\f', '\r':
1704                                 return c
1705                         }
1706                         return -1
1707                 }, p.tok.Data)
1708                 if s != "" {
1709                         p.addText(s)
1710                 }
1711         case StartTagToken:
1712                 switch p.tok.DataAtom {
1713                 case a.Html:
1714                         return inBodyIM(p)
1715                 case a.Frameset:
1716                         p.addElement()
1717                 case a.Frame:
1718                         p.addElement()
1719                         p.oe.pop()
1720                         p.acknowledgeSelfClosingTag()
1721                 case a.Noframes:
1722                         return inHeadIM(p)
1723                 }
1724         case EndTagToken:
1725                 switch p.tok.DataAtom {
1726                 case a.Frameset:
1727                         if p.oe.top().DataAtom != a.Html {
1728                                 p.oe.pop()
1729                                 if p.oe.top().DataAtom != a.Frameset {
1730                                         p.im = afterFramesetIM
1731                                         return true
1732                                 }
1733                         }
1734                 }
1735         default:
1736                 // Ignore the token.
1737         }
1738         return true
1739 }
1740
1741 // Section 12.2.5.4.20.
1742 func afterFramesetIM(p *parser) bool {
1743         switch p.tok.Type {
1744         case CommentToken:
1745                 p.addChild(&Node{
1746                         Type: CommentNode,
1747                         Data: p.tok.Data,
1748                 })
1749         case TextToken:
1750                 // Ignore all text but whitespace.
1751                 s := strings.Map(func(c rune) rune {
1752                         switch c {
1753                         case ' ', '\t', '\n', '\f', '\r':
1754                                 return c
1755                         }
1756                         return -1
1757                 }, p.tok.Data)
1758                 if s != "" {
1759                         p.addText(s)
1760                 }
1761         case StartTagToken:
1762                 switch p.tok.DataAtom {
1763                 case a.Html:
1764                         return inBodyIM(p)
1765                 case a.Noframes:
1766                         return inHeadIM(p)
1767                 }
1768         case EndTagToken:
1769                 switch p.tok.DataAtom {
1770                 case a.Html:
1771                         p.im = afterAfterFramesetIM
1772                         return true
1773                 }
1774         default:
1775                 // Ignore the token.
1776         }
1777         return true
1778 }
1779
1780 // Section 12.2.5.4.21.
1781 func afterAfterBodyIM(p *parser) bool {
1782         switch p.tok.Type {
1783         case ErrorToken:
1784                 // Stop parsing.
1785                 return true
1786         case TextToken:
1787                 s := strings.TrimLeft(p.tok.Data, whitespace)
1788                 if len(s) == 0 {
1789                         // It was all whitespace.
1790                         return inBodyIM(p)
1791                 }
1792         case StartTagToken:
1793                 if p.tok.DataAtom == a.Html {
1794                         return inBodyIM(p)
1795                 }
1796         case CommentToken:
1797                 p.doc.AppendChild(&Node{
1798                         Type: CommentNode,
1799                         Data: p.tok.Data,
1800                 })
1801                 return true
1802         case DoctypeToken:
1803                 return inBodyIM(p)
1804         }
1805         p.im = inBodyIM
1806         return false
1807 }
1808
1809 // Section 12.2.5.4.22.
1810 func afterAfterFramesetIM(p *parser) bool {
1811         switch p.tok.Type {
1812         case CommentToken:
1813                 p.doc.AppendChild(&Node{
1814                         Type: CommentNode,
1815                         Data: p.tok.Data,
1816                 })
1817         case TextToken:
1818                 // Ignore all text but whitespace.
1819                 s := strings.Map(func(c rune) rune {
1820                         switch c {
1821                         case ' ', '\t', '\n', '\f', '\r':
1822                                 return c
1823                         }
1824                         return -1
1825                 }, p.tok.Data)
1826                 if s != "" {
1827                         p.tok.Data = s
1828                         return inBodyIM(p)
1829                 }
1830         case StartTagToken:
1831                 switch p.tok.DataAtom {
1832                 case a.Html:
1833                         return inBodyIM(p)
1834                 case a.Noframes:
1835                         return inHeadIM(p)
1836                 }
1837         case DoctypeToken:
1838                 return inBodyIM(p)
1839         default:
1840                 // Ignore the token.
1841         }
1842         return true
1843 }
1844
1845 const whitespaceOrNUL = whitespace + "\x00"
1846
1847 // Section 12.2.5.5.
1848 func parseForeignContent(p *parser) bool {
1849         switch p.tok.Type {
1850         case TextToken:
1851                 if p.framesetOK {
1852                         p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""
1853                 }
1854                 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)
1855                 p.addText(p.tok.Data)
1856         case CommentToken:
1857                 p.addChild(&Node{
1858                         Type: CommentNode,
1859                         Data: p.tok.Data,
1860                 })
1861         case StartTagToken:
1862                 b := breakout[p.tok.Data]
1863                 if p.tok.DataAtom == a.Font {
1864                 loop:
1865                         for _, attr := range p.tok.Attr {
1866                                 switch attr.Key {
1867                                 case "color", "face", "size":
1868                                         b = true
1869                                         break loop
1870                                 }
1871                         }
1872                 }
1873                 if b {
1874                         for i := len(p.oe) - 1; i >= 0; i-- {
1875                                 n := p.oe[i]
1876                                 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {
1877                                         p.oe = p.oe[:i+1]
1878                                         break
1879                                 }
1880                         }
1881                         return false
1882                 }
1883                 switch p.top().Namespace {
1884                 case "math":
1885                         adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
1886                 case "svg":
1887                         // Adjust SVG tag names. The tokenizer lower-cases tag names, but
1888                         // SVG wants e.g. "foreignObject" with a capital second "O".
1889                         if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
1890                                 p.tok.DataAtom = a.Lookup([]byte(x))
1891                                 p.tok.Data = x
1892                         }
1893                         adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
1894                 default:
1895                         panic("html: bad parser state: unexpected namespace")
1896                 }
1897                 adjustForeignAttributes(p.tok.Attr)
1898                 namespace := p.top().Namespace
1899                 p.addElement()
1900                 p.top().Namespace = namespace
1901                 if namespace != "" {
1902                         // Don't let the tokenizer go into raw text mode in foreign content
1903                         // (e.g. in an SVG <title> tag).
1904                         p.tokenizer.NextIsNotRawText()
1905                 }
1906                 if p.hasSelfClosingToken {
1907                         p.oe.pop()
1908                         p.acknowledgeSelfClosingTag()
1909                 }
1910         case EndTagToken:
1911                 for i := len(p.oe) - 1; i >= 0; i-- {
1912                         if p.oe[i].Namespace == "" {
1913                                 return p.im(p)
1914                         }
1915                         if strings.EqualFold(p.oe[i].Data, p.tok.Data) {
1916                                 p.oe = p.oe[:i]
1917                                 break
1918                         }
1919                 }
1920                 return true
1921         default:
1922                 // Ignore the token.
1923         }
1924         return true
1925 }
1926
1927 // Section 12.2.5.
1928 func (p *parser) inForeignContent() bool {
1929         if len(p.oe) == 0 {
1930                 return false
1931         }
1932         n := p.oe[len(p.oe)-1]
1933         if n.Namespace == "" {
1934                 return false
1935         }
1936         if mathMLTextIntegrationPoint(n) {
1937                 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark {
1938                         return false
1939                 }
1940                 if p.tok.Type == TextToken {
1941                         return false
1942                 }
1943         }
1944         if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg {
1945                 return false
1946         }
1947         if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) {
1948                 return false
1949         }
1950         if p.tok.Type == ErrorToken {
1951                 return false
1952         }
1953         return true
1954 }
1955
1956 // parseImpliedToken parses a token as though it had appeared in the parser's
1957 // input.
1958 func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) {
1959         realToken, selfClosing := p.tok, p.hasSelfClosingToken
1960         p.tok = Token{
1961                 Type:     t,
1962                 DataAtom: dataAtom,
1963                 Data:     data,
1964         }
1965         p.hasSelfClosingToken = false
1966         p.parseCurrentToken()
1967         p.tok, p.hasSelfClosingToken = realToken, selfClosing
1968 }
1969
1970 // parseCurrentToken runs the current token through the parsing routines
1971 // until it is consumed.
1972 func (p *parser) parseCurrentToken() {
1973         if p.tok.Type == SelfClosingTagToken {
1974                 p.hasSelfClosingToken = true
1975                 p.tok.Type = StartTagToken
1976         }
1977
1978         consumed := false
1979         for !consumed {
1980                 if p.inForeignContent() {
1981                         consumed = parseForeignContent(p)
1982                 } else {
1983                         consumed = p.im(p)
1984                 }
1985         }
1986
1987         if p.hasSelfClosingToken {
1988                 // This is a parse error, but ignore it.
1989                 p.hasSelfClosingToken = false
1990         }
1991 }
1992
1993 func (p *parser) parse() error {
1994         // Iterate until EOF. Any other error will cause an early return.
1995         var err error
1996         for err != io.EOF {
1997                 // CDATA sections are allowed only in foreign content.
1998                 n := p.oe.top()
1999                 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
2000                 // Read and parse the next token.
2001                 p.tokenizer.Next()
2002                 p.tok = p.tokenizer.Token()
2003                 if p.tok.Type == ErrorToken {
2004                         err = p.tokenizer.Err()
2005                         if err != nil && err != io.EOF {
2006                                 return err
2007                         }
2008                 }
2009                 p.parseCurrentToken()
2010         }
2011         return nil
2012 }
2013
2014 // Parse returns the parse tree for the HTML from the given Reader.
2015 // The input is assumed to be UTF-8 encoded.
2016 func Parse(r io.Reader) (*Node, error) {
2017         p := &parser{
2018                 tokenizer: NewTokenizer(r),
2019                 doc: &Node{
2020                         Type: DocumentNode,
2021                 },
2022                 scripting:  true,
2023                 framesetOK: true,
2024                 im:         initialIM,
2025         }
2026         err := p.parse()
2027         if err != nil {
2028                 return nil, err
2029         }
2030         return p.doc, nil
2031 }
2032
2033 // ParseFragment parses a fragment of HTML and returns the nodes that were
2034 // found. If the fragment is the InnerHTML for an existing element, pass that
2035 // element in context.
2036 func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
2037         contextTag := ""
2038         if context != nil {
2039                 if context.Type != ElementNode {
2040                         return nil, errors.New("html: ParseFragment of non-element Node")
2041                 }
2042                 // The next check isn't just context.DataAtom.String() == context.Data because
2043                 // it is valid to pass an element whose tag isn't a known atom. For example,
2044                 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
2045                 if context.DataAtom != a.Lookup([]byte(context.Data)) {
2046                         return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
2047                 }
2048                 contextTag = context.DataAtom.String()
2049         }
2050         p := &parser{
2051                 tokenizer: NewTokenizerFragment(r, contextTag),
2052                 doc: &Node{
2053                         Type: DocumentNode,
2054                 },
2055                 scripting: true,
2056                 fragment:  true,
2057                 context:   context,
2058         }
2059
2060         root := &Node{
2061                 Type:     ElementNode,
2062                 DataAtom: a.Html,
2063                 Data:     a.Html.String(),
2064         }
2065         p.doc.AppendChild(root)
2066         p.oe = nodeStack{root}
2067         p.resetInsertionMode()
2068
2069         for n := context; n != nil; n = n.Parent {
2070                 if n.Type == ElementNode && n.DataAtom == a.Form {
2071                         p.form = n
2072                         break
2073                 }
2074         }
2075
2076         err := p.parse()
2077         if err != nil {
2078                 return nil, err
2079         }
2080
2081         parent := p.doc
2082         if context != nil {
2083                 parent = root
2084         }
2085
2086         var result []*Node
2087         for c := parent.FirstChild; c != nil; {
2088                 next := c.NextSibling
2089                 parent.RemoveChild(c)
2090                 result = append(result, c)
2091                 c = next
2092         }
2093         return result, nil
2094 }