OSDN Git Service

new repo
[bytom/vapor.git] / vendor / golang.org / x / net / webdav / internal / xml / xml.go
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package xml implements a simple XML 1.0 parser that
6 // understands XML name spaces.
7 package xml
8
9 // References:
10 //    Annotated XML spec: http://www.xml.com/axml/testaxml.htm
11 //    XML name spaces: http://www.w3.org/TR/REC-xml-names/
12
13 // TODO(rsc):
14 //      Test error handling.
15
16 import (
17         "bufio"
18         "bytes"
19         "errors"
20         "fmt"
21         "io"
22         "strconv"
23         "strings"
24         "unicode"
25         "unicode/utf8"
26 )
27
// A SyntaxError describes malformed XML found in the input stream,
// together with the line number it is reported against.
type SyntaxError struct {
	Msg  string // description of the problem
	Line int    // line number included in the error text
}

// Error implements the error interface. The text has the form
// "XML syntax error on line N: Msg".
func (e *SyntaxError) Error() string {
	line := strconv.Itoa(e.Line)
	return "XML syntax error on line " + line + ": " + e.Msg
}
37
// A Name is an XML name: the local part (Local) plus the name space it
// belongs to (Space). Tokens returned by Decoder.Token carry the
// canonical name space URL in Space, not the short prefix written in the
// document being parsed.
//
// Name space declarations are the exception: for those, Space holds the
// literal string "xmlns" rather than a resolved URL.
// See Encoder.EncodeToken for more information on namespace encoding
// behaviour.
type Name struct {
	Space, Local string
}

// isNamespace reports whether name declares a name space, i.e. whether
// either of its parts is the literal "xmlns".
func (name Name) isNamespace() bool {
	if name.Space == "xmlns" {
		return true
	}
	return name.Local == "xmlns"
}
55
// An Attr represents an attribute in an XML element (Name=Value).
type Attr struct {
        Name  Name   // attribute name, possibly carrying a name space
        Value string // attribute value as text
}
61
// A Token is an interface holding one of the token types:
// StartElement, EndElement, CharData, Comment, ProcInst, or Directive.
// Callers distinguish the concrete type with a type switch, as
// CopyToken does.
type Token interface{}
65
66 // A StartElement represents an XML start element.
67 type StartElement struct {
68         Name Name
69         Attr []Attr
70 }
71
72 func (e StartElement) Copy() StartElement {
73         attrs := make([]Attr, len(e.Attr))
74         copy(attrs, e.Attr)
75         e.Attr = attrs
76         return e
77 }
78
79 // End returns the corresponding XML end element.
80 func (e StartElement) End() EndElement {
81         return EndElement{e.Name}
82 }
83
84 // setDefaultNamespace sets the namespace of the element
85 // as the default for all elements contained within it.
86 func (e *StartElement) setDefaultNamespace() {
87         if e.Name.Space == "" {
88                 // If there's no namespace on the element, don't
89                 // set the default. Strictly speaking this might be wrong, as
90                 // we can't tell if the element had no namespace set
91                 // or was just using the default namespace.
92                 return
93         }
94         // Don't add a default name space if there's already one set.
95         for _, attr := range e.Attr {
96                 if attr.Name.Space == "" && attr.Name.Local == "xmlns" {
97                         return
98                 }
99         }
100         e.Attr = append(e.Attr, Attr{
101                 Name: Name{
102                         Local: "xmlns",
103                 },
104                 Value: e.Name.Space,
105         })
106 }
107
// An EndElement represents an XML end element.
type EndElement struct {
        Name Name // name of the element being closed
}
112
// CharData holds XML character data (raw text) in which XML escape
// sequences have already been replaced by the characters they stand for.
type CharData []byte

// makeCopy returns a newly allocated slice holding the same bytes as b.
// The result is never nil, even when b is.
func makeCopy(b []byte) []byte {
	return append(make([]byte, 0, len(b)), b...)
}

// Copy returns a CharData that does not share storage with c.
func (c CharData) Copy() CharData {
	dup := makeCopy(c)
	return CharData(dup)
}
125
126 // A Comment represents an XML comment of the form <!--comment-->.
127 // The bytes do not include the <!-- and --> comment markers.
128 type Comment []byte
129
130 func (c Comment) Copy() Comment { return Comment(makeCopy(c)) }
131
132 // A ProcInst represents an XML processing instruction of the form <?target inst?>
133 type ProcInst struct {
134         Target string
135         Inst   []byte
136 }
137
138 func (p ProcInst) Copy() ProcInst {
139         p.Inst = makeCopy(p.Inst)
140         return p
141 }
142
143 // A Directive represents an XML directive of the form <!text>.
144 // The bytes do not include the <! and > markers.
145 type Directive []byte
146
147 func (d Directive) Copy() Directive { return Directive(makeCopy(d)) }
148
149 // CopyToken returns a copy of a Token.
150 func CopyToken(t Token) Token {
151         switch v := t.(type) {
152         case CharData:
153                 return v.Copy()
154         case Comment:
155                 return v.Copy()
156         case Directive:
157                 return v.Copy()
158         case ProcInst:
159                 return v.Copy()
160         case StartElement:
161                 return v.Copy()
162         }
163         return t
164 }
165
// A Decoder represents an XML parser reading a particular input stream.
// The parser assumes that its input is encoded in UTF-8.
type Decoder struct {
        // Strict defaults to true, enforcing the requirements
        // of the XML specification.
        // If set to false, the parser allows input containing common
        // mistakes:
        //      * If an element is missing an end tag, the parser invents
        //        end tags as necessary to keep the return values from Token
        //        properly balanced.
        //      * In attribute values and character data, unknown or malformed
        //        character entities (sequences beginning with &) are left alone.
        //
        // Setting:
        //
        //      d.Strict = false;
        //      d.AutoClose = HTMLAutoClose;
        //      d.Entity = HTMLEntity
        //
        // creates a parser that can handle typical HTML.
        //
        // Strict mode does not enforce the requirements of the XML name spaces TR.
        // In particular it does not reject name space tags using undefined prefixes.
        // Such tags are recorded with the unknown prefix as the name space URL.
        Strict bool

        // When Strict == false, AutoClose indicates a set of elements to
        // consider closed immediately after they are opened, regardless
        // of whether an end element is present.
        AutoClose []string

        // Entity can be used to map non-standard entity names to string replacements.
        // The parser behaves as if these standard mappings are present in the map,
        // regardless of the actual map content:
        //
        //      "lt": "<",
        //      "gt": ">",
        //      "amp": "&",
        //      "apos": "'",
        //      "quot": `"`,
        Entity map[string]string

        // CharsetReader, if non-nil, defines a function to generate
        // charset-conversion readers, converting from the provided
        // non-UTF-8 charset into UTF-8. If CharsetReader is nil or
        // returns an error, parsing stops with an error. One of
        // the CharsetReader's result values must be non-nil.
        CharsetReader func(charset string, input io.Reader) (io.Reader, error)

        // DefaultSpace sets the default name space used for unadorned tags,
        // as if the entire XML stream were wrapped in an element containing
        // the attribute xmlns="DefaultSpace".
        DefaultSpace string

        r              io.ByteReader     // input source; installed by switchToReader
        buf            bytes.Buffer      // scratch buffer reused while accumulating token bytes
        saved          *bytes.Buffer
        stk            *stack            // top of the stack of open elements, name space undo records and EOF markers
        free           *stack            // free list of stack entries available for reuse
        needClose      bool              // when set, the next rawToken call returns EndElement{toClose}
        toClose        Name              // name of the end element owed while needClose is set
        nextToken      Token             // token held back by Token after it invented an auto-close end element
        nextByte       int               // starts at -1 (see NewDecoder); presumably a pushed-back byte - TODO confirm
        ns             map[string]string // current name space bindings: prefix ("" for the default) to URL
        err            error             // sticky error; once set, rawToken keeps returning it
        line           int               // line number reported in syntax errors; starts at 1
        offset         int64
        unmarshalDepth int               // RawToken refuses to run while this is greater than zero
}
235
236 // NewDecoder creates a new XML parser reading from r.
237 // If r does not implement io.ByteReader, NewDecoder will
238 // do its own buffering.
239 func NewDecoder(r io.Reader) *Decoder {
240         d := &Decoder{
241                 ns:       make(map[string]string),
242                 nextByte: -1,
243                 line:     1,
244                 Strict:   true,
245         }
246         d.switchToReader(r)
247         return d
248 }
249
250 // Token returns the next XML token in the input stream.
251 // At the end of the input stream, Token returns nil, io.EOF.
252 //
253 // Slices of bytes in the returned token data refer to the
254 // parser's internal buffer and remain valid only until the next
255 // call to Token. To acquire a copy of the bytes, call CopyToken
256 // or the token's Copy method.
257 //
258 // Token expands self-closing elements such as <br/>
259 // into separate start and end elements returned by successive calls.
260 //
261 // Token guarantees that the StartElement and EndElement
262 // tokens it returns are properly nested and matched:
263 // if Token encounters an unexpected end element,
264 // it will return an error.
265 //
266 // Token implements XML name spaces as described by
267 // http://www.w3.org/TR/REC-xml-names/.  Each of the
268 // Name structures contained in the Token has the Space
269 // set to the URL identifying its name space when known.
270 // If Token encounters an unrecognized name space prefix,
271 // it uses the prefix as the Space rather than report an error.
272 func (d *Decoder) Token() (t Token, err error) {
273         if d.stk != nil && d.stk.kind == stkEOF {
274                 err = io.EOF
275                 return
276         }
277         if d.nextToken != nil {
278                 t = d.nextToken
279                 d.nextToken = nil
280         } else if t, err = d.rawToken(); err != nil {
281                 return
282         }
283
284         if !d.Strict {
285                 if t1, ok := d.autoClose(t); ok {
286                         d.nextToken = t
287                         t = t1
288                 }
289         }
290         switch t1 := t.(type) {
291         case StartElement:
292                 // In XML name spaces, the translations listed in the
293                 // attributes apply to the element name and
294                 // to the other attribute names, so process
295                 // the translations first.
296                 for _, a := range t1.Attr {
297                         if a.Name.Space == "xmlns" {
298                                 v, ok := d.ns[a.Name.Local]
299                                 d.pushNs(a.Name.Local, v, ok)
300                                 d.ns[a.Name.Local] = a.Value
301                         }
302                         if a.Name.Space == "" && a.Name.Local == "xmlns" {
303                                 // Default space for untagged names
304                                 v, ok := d.ns[""]
305                                 d.pushNs("", v, ok)
306                                 d.ns[""] = a.Value
307                         }
308                 }
309
310                 d.translate(&t1.Name, true)
311                 for i := range t1.Attr {
312                         d.translate(&t1.Attr[i].Name, false)
313                 }
314                 d.pushElement(t1.Name)
315                 t = t1
316
317         case EndElement:
318                 d.translate(&t1.Name, true)
319                 if !d.popElement(&t1) {
320                         return nil, d.err
321                 }
322                 t = t1
323         }
324         return
325 }
326
// xmlURL is the name space URL that translate substitutes for the
// reserved "xml" prefix.
const xmlURL = "http://www.w3.org/XML/1998/namespace"
328
329 // Apply name space translation to name n.
330 // The default name space (for Space=="")
331 // applies only to element names, not to attribute names.
332 func (d *Decoder) translate(n *Name, isElementName bool) {
333         switch {
334         case n.Space == "xmlns":
335                 return
336         case n.Space == "" && !isElementName:
337                 return
338         case n.Space == "xml":
339                 n.Space = xmlURL
340         case n.Space == "" && n.Local == "xmlns":
341                 return
342         }
343         if v, ok := d.ns[n.Space]; ok {
344                 n.Space = v
345         } else if n.Space == "" {
346                 n.Space = d.DefaultSpace
347         }
348 }
349
350 func (d *Decoder) switchToReader(r io.Reader) {
351         // Get efficient byte at a time reader.
352         // Assume that if reader has its own
353         // ReadByte, it's efficient enough.
354         // Otherwise, use bufio.
355         if rb, ok := r.(io.ByteReader); ok {
356                 d.r = rb
357         } else {
358                 d.r = bufio.NewReader(r)
359         }
360 }
361
// Parsing state - stack holds old name space translations
// and the current set of open elements. The translations to pop when
// ending a given tag are *below* it on the stack, which is
// more work but forced on us by XML.
type stack struct {
        next *stack // entry beneath this one; also links entries on the decoder's free list
        kind int    // stkStart, stkNs or stkEOF
        name Name   // stkStart: element name; stkNs: Local is the prefix, Space its previous URL
        ok   bool   // stkNs only: whether the prefix had a previous binding to restore
}
372
// Kinds of entries kept on the parse stack.
const (
        stkStart = iota // an open element (see pushElement)
        stkNs           // a saved name space binding to undo (see pushNs)
        stkEOF          // marker that makes Token report io.EOF (see pushEOF)
)
378
379 func (d *Decoder) push(kind int) *stack {
380         s := d.free
381         if s != nil {
382                 d.free = s.next
383         } else {
384                 s = new(stack)
385         }
386         s.next = d.stk
387         s.kind = kind
388         d.stk = s
389         return s
390 }
391
392 func (d *Decoder) pop() *stack {
393         s := d.stk
394         if s != nil {
395                 d.stk = s.next
396                 s.next = d.free
397                 d.free = s
398         }
399         return s
400 }
401
402 // Record that after the current element is finished
403 // (that element is already pushed on the stack)
404 // Token should return EOF until popEOF is called.
405 func (d *Decoder) pushEOF() {
406         // Walk down stack to find Start.
407         // It might not be the top, because there might be stkNs
408         // entries above it.
409         start := d.stk
410         for start.kind != stkStart {
411                 start = start.next
412         }
413         // The stkNs entries below a start are associated with that
414         // element too; skip over them.
415         for start.next != nil && start.next.kind == stkNs {
416                 start = start.next
417         }
418         s := d.free
419         if s != nil {
420                 d.free = s.next
421         } else {
422                 s = new(stack)
423         }
424         s.kind = stkEOF
425         s.next = start.next
426         start.next = s
427 }
428
429 // Undo a pushEOF.
430 // The element must have been finished, so the EOF should be at the top of the stack.
431 func (d *Decoder) popEOF() bool {
432         if d.stk == nil || d.stk.kind != stkEOF {
433                 return false
434         }
435         d.pop()
436         return true
437 }
438
439 // Record that we are starting an element with the given name.
440 func (d *Decoder) pushElement(name Name) {
441         s := d.push(stkStart)
442         s.name = name
443 }
444
445 // Record that we are changing the value of ns[local].
446 // The old value is url, ok.
447 func (d *Decoder) pushNs(local string, url string, ok bool) {
448         s := d.push(stkNs)
449         s.name.Local = local
450         s.name.Space = url
451         s.ok = ok
452 }
453
454 // Creates a SyntaxError with the current line number.
455 func (d *Decoder) syntaxError(msg string) error {
456         return &SyntaxError{Msg: msg, Line: d.line}
457 }
458
459 // Record that we are ending an element with the given name.
460 // The name must match the record at the top of the stack,
461 // which must be a pushElement record.
462 // After popping the element, apply any undo records from
463 // the stack to restore the name translations that existed
464 // before we saw this element.
465 func (d *Decoder) popElement(t *EndElement) bool {
466         s := d.pop()
467         name := t.Name
468         switch {
469         case s == nil || s.kind != stkStart:
470                 d.err = d.syntaxError("unexpected end element </" + name.Local + ">")
471                 return false
472         case s.name.Local != name.Local:
473                 if !d.Strict {
474                         d.needClose = true
475                         d.toClose = t.Name
476                         t.Name = s.name
477                         return true
478                 }
479                 d.err = d.syntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">")
480                 return false
481         case s.name.Space != name.Space:
482                 d.err = d.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space +
483                         "closed by </" + name.Local + "> in space " + name.Space)
484                 return false
485         }
486
487         // Pop stack until a Start or EOF is on the top, undoing the
488         // translations that were associated with the element we just closed.
489         for d.stk != nil && d.stk.kind != stkStart && d.stk.kind != stkEOF {
490                 s := d.pop()
491                 if s.ok {
492                         d.ns[s.name.Local] = s.name.Space
493                 } else {
494                         delete(d.ns, s.name.Local)
495                 }
496         }
497
498         return true
499 }
500
501 // If the top element on the stack is autoclosing and
502 // t is not the end tag, invent the end tag.
503 func (d *Decoder) autoClose(t Token) (Token, bool) {
504         if d.stk == nil || d.stk.kind != stkStart {
505                 return nil, false
506         }
507         name := strings.ToLower(d.stk.name.Local)
508         for _, s := range d.AutoClose {
509                 if strings.ToLower(s) == name {
510                         // This one should be auto closed if t doesn't close it.
511                         et, ok := t.(EndElement)
512                         if !ok || et.Name.Local != name {
513                                 return EndElement{d.stk.name}, true
514                         }
515                         break
516                 }
517         }
518         return nil, false
519 }
520
// errRawToken is returned by RawToken while the decoder's
// unmarshalDepth is greater than zero.
var errRawToken = errors.New("xml: cannot use RawToken from UnmarshalXML method")
522
523 // RawToken is like Token but does not verify that
524 // start and end elements match and does not translate
525 // name space prefixes to their corresponding URLs.
526 func (d *Decoder) RawToken() (Token, error) {
527         if d.unmarshalDepth > 0 {
528                 return nil, errRawToken
529         }
530         return d.rawToken()
531 }
532
// rawToken reads the next token from the input without translating name
// space prefixes or checking that start and end elements match.
//
// It returns one of EndElement, ProcInst, Comment, CharData, Directive
// or StartElement. On failure it returns nil together with d.err; once
// d.err is set, every later call returns it again without reading.
//
// Byte slices in ProcInst, Comment and Directive tokens refer to d.buf,
// which is reset when a later token of one of those kinds is read.
func (d *Decoder) rawToken() (Token, error) {
        if d.err != nil {
                return nil, d.err
        }
        if d.needClose {
                // The last element we read was self-closing and
                // we returned just the StartElement half.
                // Return the EndElement half now.
                // (popElement also sets needClose, in non-strict mode, to
                // queue an end tag that did not match the open element.)
                d.needClose = false
                return EndElement{d.toClose}, nil
        }

        b, ok := d.getc()
        if !ok {
                return nil, d.err
        }

        if b != '<' {
                // Text section.
                d.ungetc(b)
                data := d.text(-1, false)
                if data == nil {
                        return nil, d.err
                }
                return CharData(data), nil
        }

        // The byte after '<' selects the kind of markup.
        if b, ok = d.mustgetc(); !ok {
                return nil, d.err
        }
        switch b {
        case '/':
                // </: End element
                var name Name
                if name, ok = d.nsname(); !ok {
                        if d.err == nil {
                                d.err = d.syntaxError("expected element name after </")
                        }
                        return nil, d.err
                }
                d.space()
                if b, ok = d.mustgetc(); !ok {
                        return nil, d.err
                }
                if b != '>' {
                        d.err = d.syntaxError("invalid characters between </" + name.Local + " and >")
                        return nil, d.err
                }
                return EndElement{name}, nil

        case '?':
                // <?: Processing instruction.
                var target string
                if target, ok = d.name(); !ok {
                        if d.err == nil {
                                d.err = d.syntaxError("expected target name after <?")
                        }
                        return nil, d.err
                }
                d.space()
                d.buf.Reset()
                // Accumulate everything up to and including the closing "?>";
                // b0 remembers the previous byte.
                var b0 byte
                for {
                        if b, ok = d.mustgetc(); !ok {
                                return nil, d.err
                        }
                        d.buf.WriteByte(b)
                        if b0 == '?' && b == '>' {
                                break
                        }
                        b0 = b
                }
                data := d.buf.Bytes()
                data = data[0 : len(data)-2] // chop ?>

                if target == "xml" {
                        // XML declaration: check the declared version and, for a
                        // declared encoding other than UTF-8, switch the input to
                        // the reader supplied by CharsetReader.
                        content := string(data)
                        ver := procInst("version", content)
                        if ver != "" && ver != "1.0" {
                                d.err = fmt.Errorf("xml: unsupported version %q; only version 1.0 is supported", ver)
                                return nil, d.err
                        }
                        enc := procInst("encoding", content)
                        if enc != "" && enc != "utf-8" && enc != "UTF-8" {
                                if d.CharsetReader == nil {
                                        d.err = fmt.Errorf("xml: encoding %q declared but Decoder.CharsetReader is nil", enc)
                                        return nil, d.err
                                }
                                // The unchecked assertion relies on d.r also being an
                                // io.Reader, which holds for every value switchToReader
                                // stores (assuming nothing else assigns d.r).
                                newr, err := d.CharsetReader(enc, d.r.(io.Reader))
                                if err != nil {
                                        d.err = fmt.Errorf("xml: opening charset %q: %v", enc, err)
                                        return nil, d.err
                                }
                                if newr == nil {
                                        panic("CharsetReader returned a nil Reader for charset " + enc)
                                }
                                d.switchToReader(newr)
                        }
                }
                return ProcInst{target, data}, nil

        case '!':
                // <!: Maybe comment, maybe CDATA.
                if b, ok = d.mustgetc(); !ok {
                        return nil, d.err
                }
                switch b {
                case '-': // <!-
                        // Probably <!-- for a comment.
                        if b, ok = d.mustgetc(); !ok {
                                return nil, d.err
                        }
                        if b != '-' {
                                d.err = d.syntaxError("invalid sequence <!- not part of <!--")
                                return nil, d.err
                        }
                        // Look for terminator.
                        // b0 and b1 hold the two bytes read before b.
                        d.buf.Reset()
                        var b0, b1 byte
                        for {
                                if b, ok = d.mustgetc(); !ok {
                                        return nil, d.err
                                }
                                d.buf.WriteByte(b)
                                if b0 == '-' && b1 == '-' && b == '>' {
                                        break
                                }
                                b0, b1 = b1, b
                        }
                        data := d.buf.Bytes()
                        data = data[0 : len(data)-3] // chop -->
                        return Comment(data), nil

                case '[': // <![
                        // Probably <![CDATA[.
                        for i := 0; i < 6; i++ {
                                if b, ok = d.mustgetc(); !ok {
                                        return nil, d.err
                                }
                                if b != "CDATA["[i] {
                                        d.err = d.syntaxError("invalid <![ sequence")
                                        return nil, d.err
                                }
                        }
                        // Have <![CDATA[.  Read text until ]]>.
                        data := d.text(-1, true)
                        if data == nil {
                                return nil, d.err
                        }
                        return CharData(data), nil
                }

                // Probably a directive: <!DOCTYPE ...>, <!ENTITY ...>, etc.
                // We don't care, but accumulate for caller. Quoted angle
                // brackets do not count for nesting.
                d.buf.Reset()
                d.buf.WriteByte(b)
                inquote := uint8(0) // the open quote character, or 0 when outside quotes
                depth := 0          // nesting level of unquoted '<' ... '>' pairs inside the directive
                for {
                        if b, ok = d.mustgetc(); !ok {
                                return nil, d.err
                        }
                        if inquote == 0 && b == '>' && depth == 0 {
                                break
                        }
                // HandleB is re-entered via goto with the byte that broke a
                // tentative "<!--" match, so that byte is processed as
                // ordinary directive content.
                HandleB:
                        d.buf.WriteByte(b)
                        switch {
                        case b == inquote:
                                inquote = 0

                        case inquote != 0:
                                // in quotes, no special action

                        case b == '\'' || b == '"':
                                inquote = b

                        case b == '>' && inquote == 0:
                                depth--

                        case b == '<' && inquote == 0:
                                // Look for <!-- to begin comment.
                                s := "!--"
                                for i := 0; i < len(s); i++ {
                                        if b, ok = d.mustgetc(); !ok {
                                                return nil, d.err
                                        }
                                        if b != s[i] {
                                                // Not a comment: keep the part of "!--" that
                                                // did match and treat the '<' as opening a
                                                // nested bracket.
                                                for j := 0; j < i; j++ {
                                                        d.buf.WriteByte(s[j])
                                                }
                                                depth++
                                                goto HandleB
                                        }
                                }

                                // Remove < that was written above.
                                d.buf.Truncate(d.buf.Len() - 1)

                                // Look for terminator.
                                // The comment's bytes are read and discarded, so the
                                // comment does not appear in the returned Directive.
                                var b0, b1 byte
                                for {
                                        if b, ok = d.mustgetc(); !ok {
                                                return nil, d.err
                                        }
                                        if b0 == '-' && b1 == '-' && b == '>' {
                                                break
                                        }
                                        b0, b1 = b1, b
                                }
                        }
                }
                return Directive(d.buf.Bytes()), nil
        }

        // Must be an open element like <a href="foo">
        d.ungetc(b)

        var (
                name  Name
                empty bool // set when the tag ends in "/>"
                attr  []Attr
        )
        if name, ok = d.nsname(); !ok {
                if d.err == nil {
                        d.err = d.syntaxError("expected element name after <")
                }
                return nil, d.err
        }

        attr = []Attr{}
        // Read attributes until the tag is closed by ">" or "/>".
        for {
                d.space()
                if b, ok = d.mustgetc(); !ok {
                        return nil, d.err
                }
                if b == '/' {
                        empty = true
                        if b, ok = d.mustgetc(); !ok {
                                return nil, d.err
                        }
                        if b != '>' {
                                d.err = d.syntaxError("expected /> in element")
                                return nil, d.err
                        }
                        break
                }
                if b == '>' {
                        break
                }
                d.ungetc(b)

                // Make room for one more attribute, doubling the capacity
                // (starting at 4) when the slice is full.
                n := len(attr)
                if n >= cap(attr) {
                        nCap := 2 * cap(attr)
                        if nCap == 0 {
                                nCap = 4
                        }
                        nattr := make([]Attr, n, nCap)
                        copy(nattr, attr)
                        attr = nattr
                }
                attr = attr[0 : n+1]
                a := &attr[n]
                if a.Name, ok = d.nsname(); !ok {
                        if d.err == nil {
                                d.err = d.syntaxError("expected attribute name in element")
                        }
                        return nil, d.err
                }
                d.space()
                if b, ok = d.mustgetc(); !ok {
                        return nil, d.err
                }
                if b != '=' {
                        if d.Strict {
                                d.err = d.syntaxError("attribute name without = in element")
                                return nil, d.err
                        } else {
                                // Non-strict: an attribute written without a value
                                // gets its own local name as the value.
                                d.ungetc(b)
                                a.Value = a.Name.Local
                        }
                } else {
                        d.space()
                        data := d.attrval()
                        if data == nil {
                                return nil, d.err
                        }
                        a.Value = string(data)
                }
        }
        if empty {
                // Self-closing element: the matching EndElement is returned
                // by the next call.
                d.needClose = true
                d.toClose = name
        }
        return StartElement{name, attr}, nil
}
831
832 func (d *Decoder) attrval() []byte {
833         b, ok := d.mustgetc()
834         if !ok {
835                 return nil
836         }
837         // Handle quoted attribute values
838         if b == '"' || b == '\'' {
839                 return d.text(int(b), false)
840         }
841         // Handle unquoted attribute values for strict parsers
842         if d.Strict {
843                 d.err = d.syntaxError("unquoted or missing attribute value in element")
844                 return nil
845         }
846         // Handle unquoted attribute values for unstrict parsers
847         d.ungetc(b)
848         d.buf.Reset()
849         for {
850                 b, ok = d.mustgetc()
851                 if !ok {
852                         return nil
853                 }
854                 // http://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.2
855                 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' ||
856                         '0' <= b && b <= '9' || b == '_' || b == ':' || b == '-' {
857                         d.buf.WriteByte(b)
858                 } else {
859                         d.ungetc(b)
860                         break
861                 }
862         }
863         return d.buf.Bytes()
864 }
865
866 // Skip spaces if any
867 func (d *Decoder) space() {
868         for {
869                 b, ok := d.getc()
870                 if !ok {
871                         return
872                 }
873                 switch b {
874                 case ' ', '\r', '\n', '\t':
875                 default:
876                         d.ungetc(b)
877                         return
878                 }
879         }
880 }
881
882 // Read a single byte.
883 // If there is no byte to read, return ok==false
884 // and leave the error in d.err.
885 // Maintain line number.
886 func (d *Decoder) getc() (b byte, ok bool) {
887         if d.err != nil {
888                 return 0, false
889         }
890         if d.nextByte >= 0 {
891                 b = byte(d.nextByte)
892                 d.nextByte = -1
893         } else {
894                 b, d.err = d.r.ReadByte()
895                 if d.err != nil {
896                         return 0, false
897                 }
898                 if d.saved != nil {
899                         d.saved.WriteByte(b)
900                 }
901         }
902         if b == '\n' {
903                 d.line++
904         }
905         d.offset++
906         return b, true
907 }
908
// InputOffset returns the input stream byte offset of the current decoder position.
// The offset gives the location of the end of the most recently returned token
// and the beginning of the next token.
//
// The value is d.offset, which getc increments for every byte it hands out
// and ungetc decrements when a byte is pushed back.
func (d *Decoder) InputOffset() int64 {
	return d.offset
}
915
916 // Return saved offset.
917 // If we did ungetc (nextByte >= 0), have to back up one.
918 func (d *Decoder) savedOffset() int {
919         n := d.saved.Len()
920         if d.nextByte >= 0 {
921                 n--
922         }
923         return n
924 }
925
926 // Must read a single byte.
927 // If there is no byte to read,
928 // set d.err to SyntaxError("unexpected EOF")
929 // and return ok==false
930 func (d *Decoder) mustgetc() (b byte, ok bool) {
931         if b, ok = d.getc(); !ok {
932                 if d.err == io.EOF {
933                         d.err = d.syntaxError("unexpected EOF")
934                 }
935         }
936         return
937 }
938
939 // Unread a single byte.
940 func (d *Decoder) ungetc(b byte) {
941         if b == '\n' {
942                 d.line--
943         }
944         d.nextByte = int(b)
945         d.offset--
946 }
947
// entity maps the five predefined XML entity names to the character each
// one stands for. text consults it first when expanding a &name; reference
// and only falls back to d.Entity for names that are not listed here.
var entity = map[string]int{
	"lt":   '<',
	"gt":   '>',
	"amp":  '&',
	"apos": '\'',
	"quot": '"',
}
955
956 // Read plain text section (XML calls it character data).
957 // If quote >= 0, we are in a quoted string and need to find the matching quote.
958 // If cdata == true, we are in a <![CDATA[ section and need to find ]]>.
959 // On failure return nil and leave the error in d.err.
960 func (d *Decoder) text(quote int, cdata bool) []byte {
961         var b0, b1 byte
962         var trunc int
963         d.buf.Reset()
964 Input:
965         for {
966                 b, ok := d.getc()
967                 if !ok {
968                         if cdata {
969                                 if d.err == io.EOF {
970                                         d.err = d.syntaxError("unexpected EOF in CDATA section")
971                                 }
972                                 return nil
973                         }
974                         break Input
975                 }
976
977                 // <![CDATA[ section ends with ]]>.
978                 // It is an error for ]]> to appear in ordinary text.
979                 if b0 == ']' && b1 == ']' && b == '>' {
980                         if cdata {
981                                 trunc = 2
982                                 break Input
983                         }
984                         d.err = d.syntaxError("unescaped ]]> not in CDATA section")
985                         return nil
986                 }
987
988                 // Stop reading text if we see a <.
989                 if b == '<' && !cdata {
990                         if quote >= 0 {
991                                 d.err = d.syntaxError("unescaped < inside quoted string")
992                                 return nil
993                         }
994                         d.ungetc('<')
995                         break Input
996                 }
997                 if quote >= 0 && b == byte(quote) {
998                         break Input
999                 }
1000                 if b == '&' && !cdata {
1001                         // Read escaped character expression up to semicolon.
1002                         // XML in all its glory allows a document to define and use
1003                         // its own character names with <!ENTITY ...> directives.
1004                         // Parsers are required to recognize lt, gt, amp, apos, and quot
1005                         // even if they have not been declared.
1006                         before := d.buf.Len()
1007                         d.buf.WriteByte('&')
1008                         var ok bool
1009                         var text string
1010                         var haveText bool
1011                         if b, ok = d.mustgetc(); !ok {
1012                                 return nil
1013                         }
1014                         if b == '#' {
1015                                 d.buf.WriteByte(b)
1016                                 if b, ok = d.mustgetc(); !ok {
1017                                         return nil
1018                                 }
1019                                 base := 10
1020                                 if b == 'x' {
1021                                         base = 16
1022                                         d.buf.WriteByte(b)
1023                                         if b, ok = d.mustgetc(); !ok {
1024                                                 return nil
1025                                         }
1026                                 }
1027                                 start := d.buf.Len()
1028                                 for '0' <= b && b <= '9' ||
1029                                         base == 16 && 'a' <= b && b <= 'f' ||
1030                                         base == 16 && 'A' <= b && b <= 'F' {
1031                                         d.buf.WriteByte(b)
1032                                         if b, ok = d.mustgetc(); !ok {
1033                                                 return nil
1034                                         }
1035                                 }
1036                                 if b != ';' {
1037                                         d.ungetc(b)
1038                                 } else {
1039                                         s := string(d.buf.Bytes()[start:])
1040                                         d.buf.WriteByte(';')
1041                                         n, err := strconv.ParseUint(s, base, 64)
1042                                         if err == nil && n <= unicode.MaxRune {
1043                                                 text = string(n)
1044                                                 haveText = true
1045                                         }
1046                                 }
1047                         } else {
1048                                 d.ungetc(b)
1049                                 if !d.readName() {
1050                                         if d.err != nil {
1051                                                 return nil
1052                                         }
1053                                         ok = false
1054                                 }
1055                                 if b, ok = d.mustgetc(); !ok {
1056                                         return nil
1057                                 }
1058                                 if b != ';' {
1059                                         d.ungetc(b)
1060                                 } else {
1061                                         name := d.buf.Bytes()[before+1:]
1062                                         d.buf.WriteByte(';')
1063                                         if isName(name) {
1064                                                 s := string(name)
1065                                                 if r, ok := entity[s]; ok {
1066                                                         text = string(r)
1067                                                         haveText = true
1068                                                 } else if d.Entity != nil {
1069                                                         text, haveText = d.Entity[s]
1070                                                 }
1071                                         }
1072                                 }
1073                         }
1074
1075                         if haveText {
1076                                 d.buf.Truncate(before)
1077                                 d.buf.Write([]byte(text))
1078                                 b0, b1 = 0, 0
1079                                 continue Input
1080                         }
1081                         if !d.Strict {
1082                                 b0, b1 = 0, 0
1083                                 continue Input
1084                         }
1085                         ent := string(d.buf.Bytes()[before:])
1086                         if ent[len(ent)-1] != ';' {
1087                                 ent += " (no semicolon)"
1088                         }
1089                         d.err = d.syntaxError("invalid character entity " + ent)
1090                         return nil
1091                 }
1092
1093                 // We must rewrite unescaped \r and \r\n into \n.
1094                 if b == '\r' {
1095                         d.buf.WriteByte('\n')
1096                 } else if b1 == '\r' && b == '\n' {
1097                         // Skip \r\n--we already wrote \n.
1098                 } else {
1099                         d.buf.WriteByte(b)
1100                 }
1101
1102                 b0, b1 = b1, b
1103         }
1104         data := d.buf.Bytes()
1105         data = data[0 : len(data)-trunc]
1106
1107         // Inspect each rune for being a disallowed character.
1108         buf := data
1109         for len(buf) > 0 {
1110                 r, size := utf8.DecodeRune(buf)
1111                 if r == utf8.RuneError && size == 1 {
1112                         d.err = d.syntaxError("invalid UTF-8")
1113                         return nil
1114                 }
1115                 buf = buf[size:]
1116                 if !isInCharacterRange(r) {
1117                         d.err = d.syntaxError(fmt.Sprintf("illegal character code %U", r))
1118                         return nil
1119                 }
1120         }
1121
1122         return data
1123 }
1124
// isInCharacterRange reports whether r is in the XML Character Range, per
// the Char production of http://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters:
//
//	Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
//
// The surrogate block U+D800..U+DFFF and the non-characters U+FFFE and
// U+FFFF are therefore rejected, as are all C0 controls other than tab,
// line feed and carriage return.
func isInCharacterRange(r rune) (inrange bool) {
	return r == 0x09 ||
		r == 0x0A ||
		r == 0x0D ||
		r >= 0x20 && r <= 0xD7FF ||
		r >= 0xE000 && r <= 0xFFFD ||
		r >= 0x10000 && r <= 0x10FFFF
}
1136
1137 // Get name space name: name with a : stuck in the middle.
1138 // The part before the : is the name space identifier.
1139 func (d *Decoder) nsname() (name Name, ok bool) {
1140         s, ok := d.name()
1141         if !ok {
1142                 return
1143         }
1144         i := strings.Index(s, ":")
1145         if i < 0 {
1146                 name.Local = s
1147         } else {
1148                 name.Space = s[0:i]
1149                 name.Local = s[i+1:]
1150         }
1151         return name, true
1152 }
1153
1154 // Get name: /first(first|second)*/
1155 // Do not set d.err if the name is missing (unless unexpected EOF is received):
1156 // let the caller provide better context.
1157 func (d *Decoder) name() (s string, ok bool) {
1158         d.buf.Reset()
1159         if !d.readName() {
1160                 return "", false
1161         }
1162
1163         // Now we check the characters.
1164         b := d.buf.Bytes()
1165         if !isName(b) {
1166                 d.err = d.syntaxError("invalid XML name: " + string(b))
1167                 return "", false
1168         }
1169         return string(b), true
1170 }
1171
1172 // Read a name and append its bytes to d.buf.
1173 // The name is delimited by any single-byte character not valid in names.
1174 // All multi-byte characters are accepted; the caller must check their validity.
1175 func (d *Decoder) readName() (ok bool) {
1176         var b byte
1177         if b, ok = d.mustgetc(); !ok {
1178                 return
1179         }
1180         if b < utf8.RuneSelf && !isNameByte(b) {
1181                 d.ungetc(b)
1182                 return false
1183         }
1184         d.buf.WriteByte(b)
1185
1186         for {
1187                 if b, ok = d.mustgetc(); !ok {
1188                         return
1189                 }
1190                 if b < utf8.RuneSelf && !isNameByte(b) {
1191                         d.ungetc(b)
1192                         break
1193                 }
1194                 d.buf.WriteByte(b)
1195         }
1196         return true
1197 }
1198
// isNameByte reports whether the single-byte character c may appear in an
// XML name: an ASCII letter or digit, or one of '_', ':', '.', '-'.
func isNameByte(c byte) bool {
	switch {
	case 'A' <= c && c <= 'Z',
		'a' <= c && c <= 'z',
		'0' <= c && c <= '9':
		return true
	}
	switch c {
	case '_', ':', '.', '-':
		return true
	}
	return false
}
1205
1206 func isName(s []byte) bool {
1207         if len(s) == 0 {
1208                 return false
1209         }
1210         c, n := utf8.DecodeRune(s)
1211         if c == utf8.RuneError && n == 1 {
1212                 return false
1213         }
1214         if !unicode.Is(first, c) {
1215                 return false
1216         }
1217         for n < len(s) {
1218                 s = s[n:]
1219                 c, n = utf8.DecodeRune(s)
1220                 if c == utf8.RuneError && n == 1 {
1221                         return false
1222                 }
1223                 if !unicode.Is(first, c) && !unicode.Is(second, c) {
1224                         return false
1225                 }
1226         }
1227         return true
1228 }
1229
1230 func isNameString(s string) bool {
1231         if len(s) == 0 {
1232                 return false
1233         }
1234         c, n := utf8.DecodeRuneInString(s)
1235         if c == utf8.RuneError && n == 1 {
1236                 return false
1237         }
1238         if !unicode.Is(first, c) {
1239                 return false
1240         }
1241         for n < len(s) {
1242                 s = s[n:]
1243                 c, n = utf8.DecodeRuneInString(s)
1244                 if c == utf8.RuneError && n == 1 {
1245                         return false
1246                 }
1247                 if !unicode.Is(first, c) && !unicode.Is(second, c) {
1248                         return false
1249                 }
1250         }
1251         return true
1252 }
1253
1254 // These tables were generated by cut and paste from Appendix B of
1255 // the XML spec at http://www.xml.com/axml/testaxml.htm
1256 // and then reformatting. First corresponds to (Letter | '_' | ':')
1257 // and second corresponds to NameChar.
1258
// first is the set of code points that may begin an XML name; isName and
// isNameString test the leading rune of a candidate name against it and
// also accept its members in every later position.
//
// Each entry is a unicode.Range16{Lo, Hi, Stride}: a stride of 1 covers
// every code point in [Lo, Hi], a larger stride covers every Stride-th code
// point starting at Lo, and a stride of Hi-Lo selects just the two
// endpoints.
var first = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x003A, 0x003A, 1},
		{0x0041, 0x005A, 1},
		{0x005F, 0x005F, 1},
		{0x0061, 0x007A, 1},
		{0x00C0, 0x00D6, 1},
		{0x00D8, 0x00F6, 1},
		{0x00F8, 0x00FF, 1},
		{0x0100, 0x0131, 1},
		{0x0134, 0x013E, 1},
		{0x0141, 0x0148, 1},
		{0x014A, 0x017E, 1},
		{0x0180, 0x01C3, 1},
		{0x01CD, 0x01F0, 1},
		{0x01F4, 0x01F5, 1},
		{0x01FA, 0x0217, 1},
		{0x0250, 0x02A8, 1},
		{0x02BB, 0x02C1, 1},
		{0x0386, 0x0386, 1},
		{0x0388, 0x038A, 1},
		{0x038C, 0x038C, 1},
		{0x038E, 0x03A1, 1},
		{0x03A3, 0x03CE, 1},
		{0x03D0, 0x03D6, 1},
		{0x03DA, 0x03E0, 2},
		{0x03E2, 0x03F3, 1},
		{0x0401, 0x040C, 1},
		{0x040E, 0x044F, 1},
		{0x0451, 0x045C, 1},
		{0x045E, 0x0481, 1},
		{0x0490, 0x04C4, 1},
		{0x04C7, 0x04C8, 1},
		{0x04CB, 0x04CC, 1},
		{0x04D0, 0x04EB, 1},
		{0x04EE, 0x04F5, 1},
		{0x04F8, 0x04F9, 1},
		{0x0531, 0x0556, 1},
		{0x0559, 0x0559, 1},
		{0x0561, 0x0586, 1},
		{0x05D0, 0x05EA, 1},
		{0x05F0, 0x05F2, 1},
		{0x0621, 0x063A, 1},
		{0x0641, 0x064A, 1},
		{0x0671, 0x06B7, 1},
		{0x06BA, 0x06BE, 1},
		{0x06C0, 0x06CE, 1},
		{0x06D0, 0x06D3, 1},
		{0x06D5, 0x06D5, 1},
		{0x06E5, 0x06E6, 1},
		{0x0905, 0x0939, 1},
		{0x093D, 0x093D, 1},
		{0x0958, 0x0961, 1},
		{0x0985, 0x098C, 1},
		{0x098F, 0x0990, 1},
		{0x0993, 0x09A8, 1},
		{0x09AA, 0x09B0, 1},
		{0x09B2, 0x09B2, 1},
		{0x09B6, 0x09B9, 1},
		{0x09DC, 0x09DD, 1},
		{0x09DF, 0x09E1, 1},
		{0x09F0, 0x09F1, 1},
		{0x0A05, 0x0A0A, 1},
		{0x0A0F, 0x0A10, 1},
		{0x0A13, 0x0A28, 1},
		{0x0A2A, 0x0A30, 1},
		{0x0A32, 0x0A33, 1},
		{0x0A35, 0x0A36, 1},
		{0x0A38, 0x0A39, 1},
		{0x0A59, 0x0A5C, 1},
		{0x0A5E, 0x0A5E, 1},
		{0x0A72, 0x0A74, 1},
		{0x0A85, 0x0A8B, 1},
		{0x0A8D, 0x0A8D, 1},
		{0x0A8F, 0x0A91, 1},
		{0x0A93, 0x0AA8, 1},
		{0x0AAA, 0x0AB0, 1},
		{0x0AB2, 0x0AB3, 1},
		{0x0AB5, 0x0AB9, 1},
		{0x0ABD, 0x0AE0, 0x23},
		{0x0B05, 0x0B0C, 1},
		{0x0B0F, 0x0B10, 1},
		{0x0B13, 0x0B28, 1},
		{0x0B2A, 0x0B30, 1},
		{0x0B32, 0x0B33, 1},
		{0x0B36, 0x0B39, 1},
		{0x0B3D, 0x0B3D, 1},
		{0x0B5C, 0x0B5D, 1},
		{0x0B5F, 0x0B61, 1},
		{0x0B85, 0x0B8A, 1},
		{0x0B8E, 0x0B90, 1},
		{0x0B92, 0x0B95, 1},
		{0x0B99, 0x0B9A, 1},
		{0x0B9C, 0x0B9C, 1},
		{0x0B9E, 0x0B9F, 1},
		{0x0BA3, 0x0BA4, 1},
		{0x0BA8, 0x0BAA, 1},
		{0x0BAE, 0x0BB5, 1},
		{0x0BB7, 0x0BB9, 1},
		{0x0C05, 0x0C0C, 1},
		{0x0C0E, 0x0C10, 1},
		{0x0C12, 0x0C28, 1},
		{0x0C2A, 0x0C33, 1},
		{0x0C35, 0x0C39, 1},
		{0x0C60, 0x0C61, 1},
		{0x0C85, 0x0C8C, 1},
		{0x0C8E, 0x0C90, 1},
		{0x0C92, 0x0CA8, 1},
		{0x0CAA, 0x0CB3, 1},
		{0x0CB5, 0x0CB9, 1},
		{0x0CDE, 0x0CDE, 1},
		{0x0CE0, 0x0CE1, 1},
		{0x0D05, 0x0D0C, 1},
		{0x0D0E, 0x0D10, 1},
		{0x0D12, 0x0D28, 1},
		{0x0D2A, 0x0D39, 1},
		{0x0D60, 0x0D61, 1},
		{0x0E01, 0x0E2E, 1},
		{0x0E30, 0x0E30, 1},
		{0x0E32, 0x0E33, 1},
		{0x0E40, 0x0E45, 1},
		{0x0E81, 0x0E82, 1},
		{0x0E84, 0x0E84, 1},
		{0x0E87, 0x0E88, 1},
		{0x0E8A, 0x0E8D, 3},
		{0x0E94, 0x0E97, 1},
		{0x0E99, 0x0E9F, 1},
		{0x0EA1, 0x0EA3, 1},
		{0x0EA5, 0x0EA7, 2},
		{0x0EAA, 0x0EAB, 1},
		{0x0EAD, 0x0EAE, 1},
		{0x0EB0, 0x0EB0, 1},
		{0x0EB2, 0x0EB3, 1},
		{0x0EBD, 0x0EBD, 1},
		{0x0EC0, 0x0EC4, 1},
		{0x0F40, 0x0F47, 1},
		{0x0F49, 0x0F69, 1},
		{0x10A0, 0x10C5, 1},
		{0x10D0, 0x10F6, 1},
		{0x1100, 0x1100, 1},
		{0x1102, 0x1103, 1},
		{0x1105, 0x1107, 1},
		{0x1109, 0x1109, 1},
		{0x110B, 0x110C, 1},
		{0x110E, 0x1112, 1},
		{0x113C, 0x1140, 2},
		{0x114C, 0x1150, 2},
		{0x1154, 0x1155, 1},
		{0x1159, 0x1159, 1},
		{0x115F, 0x1161, 1},
		{0x1163, 0x1169, 2},
		{0x116D, 0x116E, 1},
		{0x1172, 0x1173, 1},
		{0x1175, 0x119E, 0x119E - 0x1175},
		{0x11A8, 0x11AB, 0x11AB - 0x11A8},
		{0x11AE, 0x11AF, 1},
		{0x11B7, 0x11B8, 1},
		{0x11BA, 0x11BA, 1},
		{0x11BC, 0x11C2, 1},
		{0x11EB, 0x11F0, 0x11F0 - 0x11EB},
		{0x11F9, 0x11F9, 1},
		{0x1E00, 0x1E9B, 1},
		{0x1EA0, 0x1EF9, 1},
		{0x1F00, 0x1F15, 1},
		{0x1F18, 0x1F1D, 1},
		{0x1F20, 0x1F45, 1},
		{0x1F48, 0x1F4D, 1},
		{0x1F50, 0x1F57, 1},
		{0x1F59, 0x1F5B, 0x1F5B - 0x1F59},
		{0x1F5D, 0x1F5D, 1},
		{0x1F5F, 0x1F7D, 1},
		{0x1F80, 0x1FB4, 1},
		{0x1FB6, 0x1FBC, 1},
		{0x1FBE, 0x1FBE, 1},
		{0x1FC2, 0x1FC4, 1},
		{0x1FC6, 0x1FCC, 1},
		{0x1FD0, 0x1FD3, 1},
		{0x1FD6, 0x1FDB, 1},
		{0x1FE0, 0x1FEC, 1},
		{0x1FF2, 0x1FF4, 1},
		{0x1FF6, 0x1FFC, 1},
		{0x2126, 0x2126, 1},
		{0x212A, 0x212B, 1},
		{0x212E, 0x212E, 1},
		{0x2180, 0x2182, 1},
		{0x3007, 0x3007, 1},
		{0x3021, 0x3029, 1},
		{0x3041, 0x3094, 1},
		{0x30A1, 0x30FA, 1},
		{0x3105, 0x312C, 1},
		{0x4E00, 0x9FA5, 1},
		{0xAC00, 0xD7A3, 1},
	},
}
1453
// second is the set of additional code points that isName and isNameString
// accept in any position after the leading rune of an XML name.
//
// Each entry is a unicode.Range16{Lo, Hi, Stride}: a stride of 1 covers
// every code point in [Lo, Hi], a larger stride covers every Stride-th code
// point starting at Lo, and a stride of Hi-Lo selects just the two
// endpoints.
var second = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x002D, 0x002E, 1},
		{0x0030, 0x0039, 1},
		{0x00B7, 0x00B7, 1},
		{0x02D0, 0x02D1, 1},
		{0x0300, 0x0345, 1},
		{0x0360, 0x0361, 1},
		{0x0387, 0x0387, 1},
		{0x0483, 0x0486, 1},
		{0x0591, 0x05A1, 1},
		{0x05A3, 0x05B9, 1},
		{0x05BB, 0x05BD, 1},
		{0x05BF, 0x05BF, 1},
		{0x05C1, 0x05C2, 1},
		{0x05C4, 0x0640, 0x0640 - 0x05C4},
		{0x064B, 0x0652, 1},
		{0x0660, 0x0669, 1},
		{0x0670, 0x0670, 1},
		{0x06D6, 0x06DC, 1},
		{0x06DD, 0x06DF, 1},
		{0x06E0, 0x06E4, 1},
		{0x06E7, 0x06E8, 1},
		{0x06EA, 0x06ED, 1},
		{0x06F0, 0x06F9, 1},
		{0x0901, 0x0903, 1},
		{0x093C, 0x093C, 1},
		{0x093E, 0x094C, 1},
		{0x094D, 0x094D, 1},
		{0x0951, 0x0954, 1},
		{0x0962, 0x0963, 1},
		{0x0966, 0x096F, 1},
		{0x0981, 0x0983, 1},
		{0x09BC, 0x09BC, 1},
		{0x09BE, 0x09BF, 1},
		{0x09C0, 0x09C4, 1},
		{0x09C7, 0x09C8, 1},
		{0x09CB, 0x09CD, 1},
		{0x09D7, 0x09D7, 1},
		{0x09E2, 0x09E3, 1},
		{0x09E6, 0x09EF, 1},
		{0x0A02, 0x0A3C, 0x3A},
		{0x0A3E, 0x0A3F, 1},
		{0x0A40, 0x0A42, 1},
		{0x0A47, 0x0A48, 1},
		{0x0A4B, 0x0A4D, 1},
		{0x0A66, 0x0A6F, 1},
		{0x0A70, 0x0A71, 1},
		{0x0A81, 0x0A83, 1},
		{0x0ABC, 0x0ABC, 1},
		{0x0ABE, 0x0AC5, 1},
		{0x0AC7, 0x0AC9, 1},
		{0x0ACB, 0x0ACD, 1},
		{0x0AE6, 0x0AEF, 1},
		{0x0B01, 0x0B03, 1},
		{0x0B3C, 0x0B3C, 1},
		{0x0B3E, 0x0B43, 1},
		{0x0B47, 0x0B48, 1},
		{0x0B4B, 0x0B4D, 1},
		{0x0B56, 0x0B57, 1},
		{0x0B66, 0x0B6F, 1},
		{0x0B82, 0x0B83, 1},
		{0x0BBE, 0x0BC2, 1},
		{0x0BC6, 0x0BC8, 1},
		{0x0BCA, 0x0BCD, 1},
		{0x0BD7, 0x0BD7, 1},
		{0x0BE7, 0x0BEF, 1},
		{0x0C01, 0x0C03, 1},
		{0x0C3E, 0x0C44, 1},
		{0x0C46, 0x0C48, 1},
		{0x0C4A, 0x0C4D, 1},
		{0x0C55, 0x0C56, 1},
		{0x0C66, 0x0C6F, 1},
		{0x0C82, 0x0C83, 1},
		{0x0CBE, 0x0CC4, 1},
		{0x0CC6, 0x0CC8, 1},
		{0x0CCA, 0x0CCD, 1},
		{0x0CD5, 0x0CD6, 1},
		{0x0CE6, 0x0CEF, 1},
		{0x0D02, 0x0D03, 1},
		{0x0D3E, 0x0D43, 1},
		{0x0D46, 0x0D48, 1},
		{0x0D4A, 0x0D4D, 1},
		{0x0D57, 0x0D57, 1},
		{0x0D66, 0x0D6F, 1},
		{0x0E31, 0x0E31, 1},
		{0x0E34, 0x0E3A, 1},
		{0x0E46, 0x0E46, 1},
		{0x0E47, 0x0E4E, 1},
		{0x0E50, 0x0E59, 1},
		{0x0EB1, 0x0EB1, 1},
		{0x0EB4, 0x0EB9, 1},
		{0x0EBB, 0x0EBC, 1},
		{0x0EC6, 0x0EC6, 1},
		{0x0EC8, 0x0ECD, 1},
		{0x0ED0, 0x0ED9, 1},
		{0x0F18, 0x0F19, 1},
		{0x0F20, 0x0F29, 1},
		{0x0F35, 0x0F39, 2},
		{0x0F3E, 0x0F3F, 1},
		{0x0F71, 0x0F84, 1},
		{0x0F86, 0x0F8B, 1},
		{0x0F90, 0x0F95, 1},
		{0x0F97, 0x0F97, 1},
		{0x0F99, 0x0FAD, 1},
		{0x0FB1, 0x0FB7, 1},
		{0x0FB9, 0x0FB9, 1},
		{0x20D0, 0x20DC, 1},
		{0x20E1, 0x3005, 0x3005 - 0x20E1},
		{0x302A, 0x302F, 1},
		{0x3031, 0x3035, 1},
		{0x3099, 0x309A, 1},
		{0x309D, 0x309E, 1},
		{0x30FC, 0x30FE, 1},
	},
}
1570
// HTMLEntity is an entity map containing translations for the
// standard HTML entity characters.
//
// It is the same map value as the unexported htmlEntity, so a modification
// made through either name is visible through the other.
var HTMLEntity = htmlEntity
1574
// htmlEntity maps an HTML entity name, written without the surrounding
// '&' and ';', to the string holding the single character it stands for.
// It is the table that HTMLEntity is initialized from.
//
// The entries appear in three runs: names for the code points U+00A0
// through U+00FF (nbsp to yuml), then Greek letters and assorted symbols
// (fnof to diams), and finally quot, amp, lt, gt together with accented
// letters, spacing and punctuation marks (OElig to euro). The block comment
// inside the literal records the pipeline that presumably produced the
// entries from the W3C HTML 4 entity list.
var htmlEntity = map[string]string{
	/*
		hget http://www.w3.org/TR/html4/sgml/entities.html |
		ssam '
			,y /\&gt;/ x/\&lt;(.|\n)+/ s/\n/ /g
			,x v/^\&lt;!ENTITY/d
			,s/\&lt;!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/	"\1": "\\u\2",/g
		'
	*/
	"nbsp":     "\u00A0",
	"iexcl":    "\u00A1",
	"cent":     "\u00A2",
	"pound":    "\u00A3",
	"curren":   "\u00A4",
	"yen":      "\u00A5",
	"brvbar":   "\u00A6",
	"sect":     "\u00A7",
	"uml":      "\u00A8",
	"copy":     "\u00A9",
	"ordf":     "\u00AA",
	"laquo":    "\u00AB",
	"not":      "\u00AC",
	"shy":      "\u00AD",
	"reg":      "\u00AE",
	"macr":     "\u00AF",
	"deg":      "\u00B0",
	"plusmn":   "\u00B1",
	"sup2":     "\u00B2",
	"sup3":     "\u00B3",
	"acute":    "\u00B4",
	"micro":    "\u00B5",
	"para":     "\u00B6",
	"middot":   "\u00B7",
	"cedil":    "\u00B8",
	"sup1":     "\u00B9",
	"ordm":     "\u00BA",
	"raquo":    "\u00BB",
	"frac14":   "\u00BC",
	"frac12":   "\u00BD",
	"frac34":   "\u00BE",
	"iquest":   "\u00BF",
	"Agrave":   "\u00C0",
	"Aacute":   "\u00C1",
	"Acirc":    "\u00C2",
	"Atilde":   "\u00C3",
	"Auml":     "\u00C4",
	"Aring":    "\u00C5",
	"AElig":    "\u00C6",
	"Ccedil":   "\u00C7",
	"Egrave":   "\u00C8",
	"Eacute":   "\u00C9",
	"Ecirc":    "\u00CA",
	"Euml":     "\u00CB",
	"Igrave":   "\u00CC",
	"Iacute":   "\u00CD",
	"Icirc":    "\u00CE",
	"Iuml":     "\u00CF",
	"ETH":      "\u00D0",
	"Ntilde":   "\u00D1",
	"Ograve":   "\u00D2",
	"Oacute":   "\u00D3",
	"Ocirc":    "\u00D4",
	"Otilde":   "\u00D5",
	"Ouml":     "\u00D6",
	"times":    "\u00D7",
	"Oslash":   "\u00D8",
	"Ugrave":   "\u00D9",
	"Uacute":   "\u00DA",
	"Ucirc":    "\u00DB",
	"Uuml":     "\u00DC",
	"Yacute":   "\u00DD",
	"THORN":    "\u00DE",
	"szlig":    "\u00DF",
	"agrave":   "\u00E0",
	"aacute":   "\u00E1",
	"acirc":    "\u00E2",
	"atilde":   "\u00E3",
	"auml":     "\u00E4",
	"aring":    "\u00E5",
	"aelig":    "\u00E6",
	"ccedil":   "\u00E7",
	"egrave":   "\u00E8",
	"eacute":   "\u00E9",
	"ecirc":    "\u00EA",
	"euml":     "\u00EB",
	"igrave":   "\u00EC",
	"iacute":   "\u00ED",
	"icirc":    "\u00EE",
	"iuml":     "\u00EF",
	"eth":      "\u00F0",
	"ntilde":   "\u00F1",
	"ograve":   "\u00F2",
	"oacute":   "\u00F3",
	"ocirc":    "\u00F4",
	"otilde":   "\u00F5",
	"ouml":     "\u00F6",
	"divide":   "\u00F7",
	"oslash":   "\u00F8",
	"ugrave":   "\u00F9",
	"uacute":   "\u00FA",
	"ucirc":    "\u00FB",
	"uuml":     "\u00FC",
	"yacute":   "\u00FD",
	"thorn":    "\u00FE",
	"yuml":     "\u00FF",
	"fnof":     "\u0192",
	"Alpha":    "\u0391",
	"Beta":     "\u0392",
	"Gamma":    "\u0393",
	"Delta":    "\u0394",
	"Epsilon":  "\u0395",
	"Zeta":     "\u0396",
	"Eta":      "\u0397",
	"Theta":    "\u0398",
	"Iota":     "\u0399",
	"Kappa":    "\u039A",
	"Lambda":   "\u039B",
	"Mu":       "\u039C",
	"Nu":       "\u039D",
	"Xi":       "\u039E",
	"Omicron":  "\u039F",
	"Pi":       "\u03A0",
	"Rho":      "\u03A1",
	"Sigma":    "\u03A3",
	"Tau":      "\u03A4",
	"Upsilon":  "\u03A5",
	"Phi":      "\u03A6",
	"Chi":      "\u03A7",
	"Psi":      "\u03A8",
	"Omega":    "\u03A9",
	"alpha":    "\u03B1",
	"beta":     "\u03B2",
	"gamma":    "\u03B3",
	"delta":    "\u03B4",
	"epsilon":  "\u03B5",
	"zeta":     "\u03B6",
	"eta":      "\u03B7",
	"theta":    "\u03B8",
	"iota":     "\u03B9",
	"kappa":    "\u03BA",
	"lambda":   "\u03BB",
	"mu":       "\u03BC",
	"nu":       "\u03BD",
	"xi":       "\u03BE",
	"omicron":  "\u03BF",
	"pi":       "\u03C0",
	"rho":      "\u03C1",
	"sigmaf":   "\u03C2",
	"sigma":    "\u03C3",
	"tau":      "\u03C4",
	"upsilon":  "\u03C5",
	"phi":      "\u03C6",
	"chi":      "\u03C7",
	"psi":      "\u03C8",
	"omega":    "\u03C9",
	"thetasym": "\u03D1",
	"upsih":    "\u03D2",
	"piv":      "\u03D6",
	"bull":     "\u2022",
	"hellip":   "\u2026",
	"prime":    "\u2032",
	"Prime":    "\u2033",
	"oline":    "\u203E",
	"frasl":    "\u2044",
	"weierp":   "\u2118",
	"image":    "\u2111",
	"real":     "\u211C",
	"trade":    "\u2122",
	"alefsym":  "\u2135",
	"larr":     "\u2190",
	"uarr":     "\u2191",
	"rarr":     "\u2192",
	"darr":     "\u2193",
	"harr":     "\u2194",
	"crarr":    "\u21B5",
	"lArr":     "\u21D0",
	"uArr":     "\u21D1",
	"rArr":     "\u21D2",
	"dArr":     "\u21D3",
	"hArr":     "\u21D4",
	"forall":   "\u2200",
	"part":     "\u2202",
	"exist":    "\u2203",
	"empty":    "\u2205",
	"nabla":    "\u2207",
	"isin":     "\u2208",
	"notin":    "\u2209",
	"ni":       "\u220B",
	"prod":     "\u220F",
	"sum":      "\u2211",
	"minus":    "\u2212",
	"lowast":   "\u2217",
	"radic":    "\u221A",
	"prop":     "\u221D",
	"infin":    "\u221E",
	"ang":      "\u2220",
	"and":      "\u2227",
	"or":       "\u2228",
	"cap":      "\u2229",
	"cup":      "\u222A",
	"int":      "\u222B",
	"there4":   "\u2234",
	"sim":      "\u223C",
	"cong":     "\u2245",
	"asymp":    "\u2248",
	"ne":       "\u2260",
	"equiv":    "\u2261",
	"le":       "\u2264",
	"ge":       "\u2265",
	"sub":      "\u2282",
	"sup":      "\u2283",
	"nsub":     "\u2284",
	"sube":     "\u2286",
	"supe":     "\u2287",
	"oplus":    "\u2295",
	"otimes":   "\u2297",
	"perp":     "\u22A5",
	"sdot":     "\u22C5",
	"lceil":    "\u2308",
	"rceil":    "\u2309",
	"lfloor":   "\u230A",
	"rfloor":   "\u230B",
	"lang":     "\u2329",
	"rang":     "\u232A",
	"loz":      "\u25CA",
	"spades":   "\u2660",
	"clubs":    "\u2663",
	"hearts":   "\u2665",
	"diams":    "\u2666",
	"quot":     "\u0022",
	"amp":      "\u0026",
	"lt":       "\u003C",
	"gt":       "\u003E",
	"OElig":    "\u0152",
	"oelig":    "\u0153",
	"Scaron":   "\u0160",
	"scaron":   "\u0161",
	"Yuml":     "\u0178",
	"circ":     "\u02C6",
	"tilde":    "\u02DC",
	"ensp":     "\u2002",
	"emsp":     "\u2003",
	"thinsp":   "\u2009",
	"zwnj":     "\u200C",
	"zwj":      "\u200D",
	"lrm":      "\u200E",
	"rlm":      "\u200F",
	"ndash":    "\u2013",
	"mdash":    "\u2014",
	"lsquo":    "\u2018",
	"rsquo":    "\u2019",
	"sbquo":    "\u201A",
	"ldquo":    "\u201C",
	"rdquo":    "\u201D",
	"bdquo":    "\u201E",
	"dagger":   "\u2020",
	"Dagger":   "\u2021",
	"permil":   "\u2030",
	"lsaquo":   "\u2039",
	"rsaquo":   "\u203A",
	"euro":     "\u20AC",
}
1837
// HTMLAutoClose is the set of HTML elements that
// should be considered to close automatically.
//
// It is initialized from the package-private htmlAutoClose slice, so both
// names share one backing array rather than holding independent copies.
var HTMLAutoClose = htmlAutoClose
1841
// htmlAutoClose lists, in lower case, the HTML element names that
// HTMLAutoClose is initialized from. The block comment inside the literal
// records the pipeline that presumably produced the list: it picks the
// element declarations in the W3C HTML 4 loose DTD that match "- O EMPTY"
// and lower-cases their names.
var htmlAutoClose = []string{
	/*
		hget http://www.w3.org/TR/html4/loose.dtd |
		9 sed -n 's/<!ELEMENT ([^ ]*) +- O EMPTY.+/	"\1",/p' | tr A-Z a-z
	*/
	"basefont",
	"br",
	"area",
	"link",
	"img",
	"param",
	"hr",
	"input",
	"col",
	"frame",
	"isindex",
	"base",
	"meta",
}
1861
// Replacement byte sequences written in place of characters that must not
// appear literally in escaped XML output.
var (
	// Markup delimiters, written as named references.
	esc_amp = []byte("&amp;")
	esc_lt  = []byte("&lt;")
	esc_gt  = []byte("&gt;")

	// Quote characters, written as decimal references because those are
	// shorter than the named forms "&quot;" and "&apos;".
	esc_quot = []byte("&#34;")
	esc_apos = []byte("&#39;")

	// Tab, newline and carriage return, written as hexadecimal references.
	esc_tab = []byte("&#x9;")
	esc_nl  = []byte("&#xA;")
	esc_cr  = []byte("&#xD;")

	// The Unicode replacement character, encoded as UTF-8.
	esc_fffd = []byte("\uFFFD")
)
1873
1874 // EscapeText writes to w the properly escaped XML equivalent
1875 // of the plain text data s.
1876 func EscapeText(w io.Writer, s []byte) error {
1877         return escapeText(w, s, true)
1878 }
1879
1880 // escapeText writes to w the properly escaped XML equivalent
1881 // of the plain text data s. If escapeNewline is true, newline
1882 // characters will be escaped.
1883 func escapeText(w io.Writer, s []byte, escapeNewline bool) error {
1884         var esc []byte
1885         last := 0
1886         for i := 0; i < len(s); {
1887                 r, width := utf8.DecodeRune(s[i:])
1888                 i += width
1889                 switch r {
1890                 case '"':
1891                         esc = esc_quot
1892                 case '\'':
1893                         esc = esc_apos
1894                 case '&':
1895                         esc = esc_amp
1896                 case '<':
1897                         esc = esc_lt
1898                 case '>':
1899                         esc = esc_gt
1900                 case '\t':
1901                         esc = esc_tab
1902                 case '\n':
1903                         if !escapeNewline {
1904                                 continue
1905                         }
1906                         esc = esc_nl
1907                 case '\r':
1908                         esc = esc_cr
1909                 default:
1910                         if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) {
1911                                 esc = esc_fffd
1912                                 break
1913                         }
1914                         continue
1915                 }
1916                 if _, err := w.Write(s[last : i-width]); err != nil {
1917                         return err
1918                 }
1919                 if _, err := w.Write(esc); err != nil {
1920                         return err
1921                 }
1922                 last = i
1923         }
1924         if _, err := w.Write(s[last:]); err != nil {
1925                 return err
1926         }
1927         return nil
1928 }
1929
1930 // EscapeString writes to p the properly escaped XML equivalent
1931 // of the plain text data s.
1932 func (p *printer) EscapeString(s string) {
1933         var esc []byte
1934         last := 0
1935         for i := 0; i < len(s); {
1936                 r, width := utf8.DecodeRuneInString(s[i:])
1937                 i += width
1938                 switch r {
1939                 case '"':
1940                         esc = esc_quot
1941                 case '\'':
1942                         esc = esc_apos
1943                 case '&':
1944                         esc = esc_amp
1945                 case '<':
1946                         esc = esc_lt
1947                 case '>':
1948                         esc = esc_gt
1949                 case '\t':
1950                         esc = esc_tab
1951                 case '\n':
1952                         esc = esc_nl
1953                 case '\r':
1954                         esc = esc_cr
1955                 default:
1956                         if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) {
1957                                 esc = esc_fffd
1958                                 break
1959                         }
1960                         continue
1961                 }
1962                 p.WriteString(s[last : i-width])
1963                 p.Write(esc)
1964                 last = i
1965         }
1966         p.WriteString(s[last:])
1967 }
1968
1969 // Escape is like EscapeText but omits the error return value.
1970 // It is provided for backwards compatibility with Go 1.0.
1971 // Code targeting Go 1.1 or later should use EscapeText.
1972 func Escape(w io.Writer, s []byte) {
1973         EscapeText(w, s)
1974 }
1975
// procInst parses the `param="..."` or `param='...'`
// value out of the provided string, returning "" if not found.
//
// s is scanned left to right for the text `param=`. An occurrence counts
// only when it sits at the very start of s or directly after a space, tab,
// carriage return or newline, so that looking up "encoding" is not
// satisfied by a longer name such as "xencoding" or by text embedded in
// another quoted value. An occurrence that is not immediately followed by a
// single or double quote is skipped and the scan continues with the rest
// of s. The result is the text between the opening quote and the next
// quote of the same kind; an unterminated value yields "".
func procInst(param, s string) string {
	// TODO: this parsing is still not a full pseudo-attribute parser
	// (for example, whitespace around '=' is not accepted).
	param += "="
	for from := 0; from < len(s); {
		k := strings.Index(s[from:], param)
		if k == -1 {
			return ""
		}
		nameAt := from + k
		valueAt := nameAt + len(param)
		// Whatever happens below, resume just past the start of this
		// occurrence so that every iteration makes progress.
		from = nameAt + 1

		// Require a name boundary in front of the match.
		if nameAt > 0 {
			switch s[nameAt-1] {
			case ' ', '\t', '\r', '\n':
			default:
				continue
			}
		}
		if valueAt >= len(s) {
			// `param=` is the last thing in s; there is no value.
			return ""
		}
		quote := s[valueAt]
		if quote != '\'' && quote != '"' {
			// Unquoted value: ignore it and look for a later occurrence.
			continue
		}
		end := strings.IndexByte(s[valueAt+1:], quote)
		if end == -1 {
			return ""
		}
		return s[valueAt+1 : valueAt+1+end]
	}
	return ""
}