vendor/golang.org/x/net/webdav/internal/xml/xml.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package xml implements a simple XML 1.0 parser that
   6 // understands XML name spaces.
   7 package xml
   8
   9 // References:
  10 //    Annotated XML spec: http://www.xml.com/axml/testaxml.htm
  11 //    XML name spaces: http://www.w3.org/TR/REC-xml-names/
  12
  13 // TODO(rsc):
  14 //      Test error handling.
  15
  16 import (
  17         "bufio"
  18         "bytes"
  19         "errors"
  20         "fmt"
  21         "io"
  22         "strconv"
  23         "strings"
  24         "unicode"
  25         "unicode/utf8"
  26 )
  27
// A SyntaxError represents a syntax error in the XML input stream.
type SyntaxError struct {
	Msg  string // description of what was wrong with the input
	Line int    // line number included in the formatted error message
}
  33
  34 func (e *SyntaxError) Error() string {
  35         return "XML syntax error on line " + strconv.Itoa(e.Line) + ": " + e.Msg
  36 }
  37
// A Name represents an XML name (Local) annotated with a name space
// identifier (Space). In tokens returned by Decoder.Token, the Space
// identifier is given as a canonical URL, not the short prefix used in
// the document being parsed.
//
// As a special case, XML namespace declarations will use the literal
// string "xmlns" for the Space field instead of the fully resolved URL.
// See Encoder.EncodeToken for more information on namespace encoding
// behaviour.
type Name struct {
	// Space is the name space identifier and Local is the local part
	// of the name.
	Space, Local string
}
  50
  51 // isNamespace reports whether the name is a namespace-defining name.
  52 func (name Name) isNamespace() bool {
  53         return name.Local == "xmlns" || name.Space == "xmlns"
  54 }
  55
// An Attr represents an attribute in an XML element (Name=Value).
type Attr struct {
	Name  Name   // attribute name, possibly qualified by a name space
	Value string // attribute value
}
  61
// A Token is an interface holding one of the token types:
// StartElement, EndElement, CharData, Comment, ProcInst, or Directive.
// Use a type switch to find out which one a given value holds.
type Token interface{}
  65
// A StartElement represents an XML start element.
type StartElement struct {
	Name Name   // element name
	Attr []Attr // attributes in the order they were parsed
}
  71
  72 func (e StartElement) Copy() StartElement {
  73         attrs := make([]Attr, len(e.Attr))
  74         copy(attrs, e.Attr)
  75         e.Attr = attrs
  76         return e
  77 }
  78
  79 // End returns the corresponding XML end element.
  80 func (e StartElement) End() EndElement {
  81         return EndElement{e.Name}
  82 }
  83
  84 // setDefaultNamespace sets the namespace of the element
  85 // as the default for all elements contained within it.
  86 func (e *StartElement) setDefaultNamespace() {
  87         if e.Name.Space == "" {
  88                 // If there's no namespace on the element, don't
  89                 // set the default. Strictly speaking this might be wrong, as
  90                 // we can't tell if the element had no namespace set
  91                 // or was just using the default namespace.
  92                 return
  93         }
  94         // Don't add a default name space if there's already one set.
  95         for _, attr := range e.Attr {
  96                 if attr.Name.Space == "" && attr.Name.Local == "xmlns" {
  97                         return
  98                 }
  99         }
 100         e.Attr = append(e.Attr, Attr{
 101                 Name: Name{
 102                         Local: "xmlns",
 103                 },
 104                 Value: e.Name.Space,
 105         })
 106 }
 107
// An EndElement represents an XML end element.
type EndElement struct {
	Name Name // name of the element being closed
}
 112
// A CharData represents XML character data (raw text),
// in which XML escape sequences have been replaced by
// the characters they represent.
// Use Copy to obtain a value that owns its bytes.
type CharData []byte
 117
 118 func makeCopy(b []byte) []byte {
 119         b1 := make([]byte, len(b))
 120         copy(b1, b)
 121         return b1
 122 }
 123
 124 func (c CharData) Copy() CharData { return CharData(makeCopy(c)) }
 125
// A Comment represents an XML comment of the form <!--comment-->.
// The bytes do not include the <!-- and --> comment markers.
// Use Copy to obtain a value that owns its bytes.
type Comment []byte
 129
 130 func (c Comment) Copy() Comment { return Comment(makeCopy(c)) }
 131
// A ProcInst represents an XML processing instruction of the form <?target inst?>
type ProcInst struct {
	Target string // the name following <?
	Inst   []byte // the instruction text, without the closing ?>
}
 137
 138 func (p ProcInst) Copy() ProcInst {
 139         p.Inst = makeCopy(p.Inst)
 140         return p
 141 }
 142
// A Directive represents an XML directive of the form <!text>.
// The bytes do not include the <! and > markers.
// Use Copy to obtain a value that owns its bytes.
type Directive []byte
 146
 147 func (d Directive) Copy() Directive { return Directive(makeCopy(d)) }
 148
 149 // CopyToken returns a copy of a Token.
 150 func CopyToken(t Token) Token {
 151         switch v := t.(type) {
 152         case CharData:
 153                 return v.Copy()
 154         case Comment:
 155                 return v.Copy()
 156         case Directive:
 157                 return v.Copy()
 158         case ProcInst:
 159                 return v.Copy()
 160         case StartElement:
 161                 return v.Copy()
 162         }
 163         return t
 164 }
 165
// A Decoder represents an XML parser reading a particular input stream.
// The parser assumes that its input is encoded in UTF-8.
type Decoder struct {
	// Strict defaults to true, enforcing the requirements
	// of the XML specification.
	// If set to false, the parser allows input containing common
	// mistakes:
	//	* If an element is missing an end tag, the parser invents
	//	  end tags as necessary to keep the return values from Token
	//	  properly balanced.
	//	* In attribute values and character data, unknown or malformed
	//	  character entities (sequences beginning with &) are left alone.
	//
	// Setting:
	//
	//	d.Strict = false;
	//	d.AutoClose = HTMLAutoClose;
	//	d.Entity = HTMLEntity
	//
	// creates a parser that can handle typical HTML.
	//
	// Strict mode does not enforce the requirements of the XML name spaces TR.
	// In particular it does not reject name space tags using undefined prefixes.
	// Such tags are recorded with the unknown prefix as the name space URL.
	Strict bool

	// When Strict == false, AutoClose indicates a set of elements to
	// consider closed immediately after they are opened, regardless
	// of whether an end element is present.
	AutoClose []string

	// Entity can be used to map non-standard entity names to string replacements.
	// The parser behaves as if these standard mappings are present in the map,
	// regardless of the actual map content:
	//
	//	"lt": "<",
	//	"gt": ">",
	//	"amp": "&",
	//	"apos": "'",
	//	"quot": `"`,
	Entity map[string]string

	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// non-UTF-8 charset into UTF-8. If CharsetReader is nil or
	// returns an error, parsing stops with an error. One of
	// the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)

	// DefaultSpace sets the default name space used for unadorned tags,
	// as if the entire XML stream were wrapped in an element containing
	// the attribute xmlns="DefaultSpace".
	DefaultSpace string

	r              io.ByteReader     // input source; installed by switchToReader
	buf            bytes.Buffer      // scratch buffer reused by rawToken for token bytes
	saved          *bytes.Buffer     // NOTE(review): not referenced in the visible part of the file
	stk            *stack            // top of the parse stack (open elements, ns undo records, EOF markers)
	free           *stack            // free list of stack entries recycled by pop and reused by push
	needClose      bool              // when set, the next rawToken call returns EndElement{toClose}
	toClose        Name              // name of the end element owed when needClose is set
	nextToken      Token             // token held back by Token after autoClose invented an end element
	nextByte       int               // starts at -1; presumably the pushed-back byte for ungetc, -1 meaning none — TODO confirm
	ns             map[string]string // name space translations in force: prefix -> URL ("" is the default space)
	err            error             // sticky error; once set, rawToken keeps returning it
	line           int               // current line number, starting at 1; reported in SyntaxError
	offset         int64             // NOTE(review): presumably the input byte offset; not referenced in the visible part of the file
	unmarshalDepth int               // while > 0, RawToken refuses to run and returns errRawToken
}
 235
 236 // NewDecoder creates a new XML parser reading from r.
 237 // If r does not implement io.ByteReader, NewDecoder will
 238 // do its own buffering.
 239 func NewDecoder(r io.Reader) *Decoder {
 240         d := &Decoder{
 241                 ns:       make(map[string]string),
 242                 nextByte: -1,
 243                 line:     1,
 244                 Strict:   true,
 245         }
 246         d.switchToReader(r)
 247         return d
 248 }
 249
 250 // Token returns the next XML token in the input stream.
 251 // At the end of the input stream, Token returns nil, io.EOF.
 252 //
 253 // Slices of bytes in the returned token data refer to the
 254 // parser's internal buffer and remain valid only until the next
 255 // call to Token. To acquire a copy of the bytes, call CopyToken
 256 // or the token's Copy method.
 257 //
 258 // Token expands self-closing elements such as <br/>
 259 // into separate start and end elements returned by successive calls.
 260 //
 261 // Token guarantees that the StartElement and EndElement
 262 // tokens it returns are properly nested and matched:
 263 // if Token encounters an unexpected end element,
 264 // it will return an error.
 265 //
 266 // Token implements XML name spaces as described by
 267 // http://www.w3.org/TR/REC-xml-names/.  Each of the
 268 // Name structures contained in the Token has the Space
 269 // set to the URL identifying its name space when known.
 270 // If Token encounters an unrecognized name space prefix,
 271 // it uses the prefix as the Space rather than report an error.
 272 func (d *Decoder) Token() (t Token, err error) {
 273         if d.stk != nil && d.stk.kind == stkEOF {
 274                 err = io.EOF
 275                 return
 276         }
 277         if d.nextToken != nil {
 278                 t = d.nextToken
 279                 d.nextToken = nil
 280         } else if t, err = d.rawToken(); err != nil {
 281                 return
 282         }
 283
 284         if !d.Strict {
 285                 if t1, ok := d.autoClose(t); ok {
 286                         d.nextToken = t
 287                         t = t1
 288                 }
 289         }
 290         switch t1 := t.(type) {
 291         case StartElement:
 292                 // In XML name spaces, the translations listed in the
 293                 // attributes apply to the element name and
 294                 // to the other attribute names, so process
 295                 // the translations first.
 296                 for _, a := range t1.Attr {
 297                         if a.Name.Space == "xmlns" {
 298                                 v, ok := d.ns[a.Name.Local]
 299                                 d.pushNs(a.Name.Local, v, ok)
 300                                 d.ns[a.Name.Local] = a.Value
 301                         }
 302                         if a.Name.Space == "" && a.Name.Local == "xmlns" {
 303                                 // Default space for untagged names
 304                                 v, ok := d.ns[""]
 305                                 d.pushNs("", v, ok)
 306                                 d.ns[""] = a.Value
 307                         }
 308                 }
 309
 310                 d.translate(&t1.Name, true)
 311                 for i := range t1.Attr {
 312                         d.translate(&t1.Attr[i].Name, false)
 313                 }
 314                 d.pushElement(t1.Name)
 315                 t = t1
 316
 317         case EndElement:
 318                 d.translate(&t1.Name, true)
 319                 if !d.popElement(&t1) {
 320                         return nil, d.err
 321                 }
 322                 t = t1
 323         }
 324         return
 325 }
 326
// xmlURL is the name space URL that translate substitutes for the
// "xml" prefix.
const xmlURL = "http://www.w3.org/XML/1998/namespace"
 328
 329 // Apply name space translation to name n.
 330 // The default name space (for Space=="")
 331 // applies only to element names, not to attribute names.
 332 func (d *Decoder) translate(n *Name, isElementName bool) {
 333         switch {
 334         case n.Space == "xmlns":
 335                 return
 336         case n.Space == "" && !isElementName:
 337                 return
 338         case n.Space == "xml":
 339                 n.Space = xmlURL
 340         case n.Space == "" && n.Local == "xmlns":
 341                 return
 342         }
 343         if v, ok := d.ns[n.Space]; ok {
 344                 n.Space = v
 345         } else if n.Space == "" {
 346                 n.Space = d.DefaultSpace
 347         }
 348 }
 349
 350 func (d *Decoder) switchToReader(r io.Reader) {
 351         // Get efficient byte at a time reader.
 352         // Assume that if reader has its own
 353         // ReadByte, it's efficient enough.
 354         // Otherwise, use bufio.
 355         if rb, ok := r.(io.ByteReader); ok {
 356                 d.r = rb
 357         } else {
 358                 d.r = bufio.NewReader(r)
 359         }
 360 }
 361
// Parsing state - stack holds old name space translations
// and the current set of open elements. The translations to pop when
// ending a given tag are *below* it on the stack, which is
// more work but forced on us by XML.
type stack struct {
	next *stack // entry below this one (or the next free entry when on the free list)
	kind int    // one of stkStart, stkNs, stkEOF
	name Name   // stkStart: the open element's name; stkNs: Local is the prefix, Space the previous URL
	ok   bool   // stkNs only: whether the prefix had a previous translation to restore
}
 372
// Kinds of stack entry.
const (
	stkStart = iota // an open element (see pushElement)
	stkNs           // an undo record for a name space translation (see pushNs)
	stkEOF          // a marker that makes Token report io.EOF (see pushEOF)
)
 378
 379 func (d *Decoder) push(kind int) *stack {
 380         s := d.free
 381         if s != nil {
 382                 d.free = s.next
 383         } else {
 384                 s = new(stack)
 385         }
 386         s.next = d.stk
 387         s.kind = kind
 388         d.stk = s
 389         return s
 390 }
 391
 392 func (d *Decoder) pop() *stack {
 393         s := d.stk
 394         if s != nil {
 395                 d.stk = s.next
 396                 s.next = d.free
 397                 d.free = s
 398         }
 399         return s
 400 }
 401
 402 // Record that after the current element is finished
 403 // (that element is already pushed on the stack)
 404 // Token should return EOF until popEOF is called.
 405 func (d *Decoder) pushEOF() {
 406         // Walk down stack to find Start.
 407         // It might not be the top, because there might be stkNs
 408         // entries above it.
 409         start := d.stk
 410         for start.kind != stkStart {
 411                 start = start.next
 412         }
 413         // The stkNs entries below a start are associated with that
 414         // element too; skip over them.
 415         for start.next != nil && start.next.kind == stkNs {
 416                 start = start.next
 417         }
 418         s := d.free
 419         if s != nil {
 420                 d.free = s.next
 421         } else {
 422                 s = new(stack)
 423         }
 424         s.kind = stkEOF
 425         s.next = start.next
 426         start.next = s
 427 }
 428
 429 // Undo a pushEOF.
 430 // The element must have been finished, so the EOF should be at the top of the stack.
 431 func (d *Decoder) popEOF() bool {
 432         if d.stk == nil || d.stk.kind != stkEOF {
 433                 return false
 434         }
 435         d.pop()
 436         return true
 437 }
 438
 439 // Record that we are starting an element with the given name.
 440 func (d *Decoder) pushElement(name Name) {
 441         s := d.push(stkStart)
 442         s.name = name
 443 }
 444
 445 // Record that we are changing the value of ns[local].
 446 // The old value is url, ok.
 447 func (d *Decoder) pushNs(local string, url string, ok bool) {
 448         s := d.push(stkNs)
 449         s.name.Local = local
 450         s.name.Space = url
 451         s.ok = ok
 452 }
 453
 454 // Creates a SyntaxError with the current line number.
 455 func (d *Decoder) syntaxError(msg string) error {
 456         return &SyntaxError{Msg: msg, Line: d.line}
 457 }
 458
 459 // Record that we are ending an element with the given name.
 460 // The name must match the record at the top of the stack,
 461 // which must be a pushElement record.
 462 // After popping the element, apply any undo records from
 463 // the stack to restore the name translations that existed
 464 // before we saw this element.
 465 func (d *Decoder) popElement(t *EndElement) bool {
 466         s := d.pop()
 467         name := t.Name
 468         switch {
 469         case s == nil || s.kind != stkStart:
 470                 d.err = d.syntaxError("unexpected end element </" + name.Local + ">")
 471                 return false
 472         case s.name.Local != name.Local:
 473                 if !d.Strict {
 474                         d.needClose = true
 475                         d.toClose = t.Name
 476                         t.Name = s.name
 477                         return true
 478                 }
 479                 d.err = d.syntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">")
 480                 return false
 481         case s.name.Space != name.Space:
 482                 d.err = d.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space +
 483                         "closed by </" + name.Local + "> in space " + name.Space)
 484                 return false
 485         }
 486
 487         // Pop stack until a Start or EOF is on the top, undoing the
 488         // translations that were associated with the element we just closed.
 489         for d.stk != nil && d.stk.kind != stkStart && d.stk.kind != stkEOF {
 490                 s := d.pop()
 491                 if s.ok {
 492                         d.ns[s.name.Local] = s.name.Space
 493                 } else {
 494                         delete(d.ns, s.name.Local)
 495                 }
 496         }
 497
 498         return true
 499 }
 500
 501 // If the top element on the stack is autoclosing and
 502 // t is not the end tag, invent the end tag.
 503 func (d *Decoder) autoClose(t Token) (Token, bool) {
 504         if d.stk == nil || d.stk.kind != stkStart {
 505                 return nil, false
 506         }
 507         name := strings.ToLower(d.stk.name.Local)
 508         for _, s := range d.AutoClose {
 509                 if strings.ToLower(s) == name {
 510                         // This one should be auto closed if t doesn't close it.
 511                         et, ok := t.(EndElement)
 512                         if !ok || et.Name.Local != name {
 513                                 return EndElement{d.stk.name}, true
 514                         }
 515                         break
 516                 }
 517         }
 518         return nil, false
 519 }
 520
// errRawToken is returned by RawToken when it is called while
// d.unmarshalDepth is greater than zero.
var errRawToken = errors.New("xml: cannot use RawToken from UnmarshalXML method")
 522
 523 // RawToken is like Token but does not verify that
 524 // start and end elements match and does not translate
 525 // name space prefixes to their corresponding URLs.
 526 func (d *Decoder) RawToken() (Token, error) {
 527         if d.unmarshalDepth > 0 {
 528                 return nil, errRawToken
 529         }
 530         return d.rawToken()
 531 }
 532
// rawToken reads the next token from the input without checking that
// start and end elements match and without translating name space
// prefixes; Token layers those on top.
//
// It returns one of CharData, EndElement, ProcInst, Comment, Directive
// or StartElement. On failure it returns nil together with d.err. Once
// d.err is set, every later call returns it again.
//
// Byte slices in the returned token may alias d.buf, which is reset on
// later calls.
func (d *Decoder) rawToken() (Token, error) {
	if d.err != nil {
		return nil, d.err
	}
	if d.needClose {
		// The last element we read was self-closing and
		// we returned just the StartElement half.
		// Return the EndElement half now.
		// (popElement also sets needClose, in non-strict mode, to
		// queue an end element whose name did not match.)
		d.needClose = false
		return EndElement{d.toClose}, nil
	}

	b, ok := d.getc()
	if !ok {
		return nil, d.err
	}

	if b != '<' {
		// Text section.
		d.ungetc(b)
		data := d.text(-1, false)
		if data == nil {
			return nil, d.err
		}
		return CharData(data), nil
	}

	// We have seen '<'; the next byte decides what kind of markup this is.
	if b, ok = d.mustgetc(); !ok {
		return nil, d.err
	}
	switch b {
	case '/':
		// </: End element
		var name Name
		if name, ok = d.nsname(); !ok {
			if d.err == nil {
				d.err = d.syntaxError("expected element name after </")
			}
			return nil, d.err
		}
		d.space()
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		if b != '>' {
			d.err = d.syntaxError("invalid characters between </" + name.Local + " and >")
			return nil, d.err
		}
		return EndElement{name}, nil

	case '?':
		// <?: Processing instruction.
		var target string
		if target, ok = d.name(); !ok {
			if d.err == nil {
				d.err = d.syntaxError("expected target name after <?")
			}
			return nil, d.err
		}
		d.space()
		d.buf.Reset()
		// Accumulate bytes until the two-byte terminator ?> has been
		// written; b0 remembers the previous byte.
		var b0 byte
		for {
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			d.buf.WriteByte(b)
			if b0 == '?' && b == '>' {
				break
			}
			b0 = b
		}
		data := d.buf.Bytes()
		data = data[0 : len(data)-2] // chop ?>

		if target == "xml" {
			// XML declaration: check the version, and switch readers
			// if a non-UTF-8 encoding is declared.
			content := string(data)
			ver := procInst("version", content)
			if ver != "" && ver != "1.0" {
				d.err = fmt.Errorf("xml: unsupported version %q; only version 1.0 is supported", ver)
				return nil, d.err
			}
			enc := procInst("encoding", content)
			if enc != "" && enc != "utf-8" && enc != "UTF-8" {
				if d.CharsetReader == nil {
					d.err = fmt.Errorf("xml: encoding %q declared but Decoder.CharsetReader is nil", enc)
					return nil, d.err
				}
				// This assertion relies on d.r also implementing
				// io.Reader; both values switchToReader installs
				// (the caller's reader or a bufio.Reader) do.
				newr, err := d.CharsetReader(enc, d.r.(io.Reader))
				if err != nil {
					d.err = fmt.Errorf("xml: opening charset %q: %v", enc, err)
					return nil, d.err
				}
				if newr == nil {
					// A nil reader with a nil error breaks the
					// documented CharsetReader contract.
					panic("CharsetReader returned a nil Reader for charset " + enc)
				}
				d.switchToReader(newr)
			}
		}
		return ProcInst{target, data}, nil

	case '!':
		// <!: Maybe comment, maybe CDATA.
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		switch b {
		case '-': // <!-
			// Probably <!-- for a comment.
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			if b != '-' {
				d.err = d.syntaxError("invalid sequence <!- not part of <!--")
				return nil, d.err
			}
			// Look for terminator.
			// b0 and b1 hold the two bytes preceding b.
			d.buf.Reset()
			var b0, b1 byte
			for {
				if b, ok = d.mustgetc(); !ok {
					return nil, d.err
				}
				d.buf.WriteByte(b)
				if b0 == '-' && b1 == '-' && b == '>' {
					break
				}
				b0, b1 = b1, b
			}
			data := d.buf.Bytes()
			data = data[0 : len(data)-3] // chop -->
			return Comment(data), nil

		case '[': // <![
			// Probably <![CDATA[.
			for i := 0; i < 6; i++ {
				if b, ok = d.mustgetc(); !ok {
					return nil, d.err
				}
				if b != "CDATA["[i] {
					d.err = d.syntaxError("invalid <![ sequence")
					return nil, d.err
				}
			}
			// Have <![CDATA[.  Read text until ]]>.
			data := d.text(-1, true)
			if data == nil {
				return nil, d.err
			}
			return CharData(data), nil
		}

		// Probably a directive: <!DOCTYPE ...>, <!ENTITY ...>, etc.
		// We don't care, but accumulate for caller. Quoted angle
		// brackets do not count for nesting.
		d.buf.Reset()
		d.buf.WriteByte(b)
		inquote := uint8(0) // the open quote character, or 0 when outside quotes
		depth := 0          // nesting level of unquoted < > pairs inside the directive
		for {
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			// An unquoted '>' at nesting depth zero ends the directive;
			// it is not written to the buffer.
			if inquote == 0 && b == '>' && depth == 0 {
				break
			}
		HandleB:
			d.buf.WriteByte(b)
			switch {
			case b == inquote:
				inquote = 0

			case inquote != 0:
				// in quotes, no special action

			case b == '\'' || b == '"':
				inquote = b

			case b == '>' && inquote == 0:
				depth--

			case b == '<' && inquote == 0:
				// Look for <!-- to begin comment.
				s := "!--"
				for i := 0; i < len(s); i++ {
					if b, ok = d.mustgetc(); !ok {
						return nil, d.err
					}
					if b != s[i] {
						// Not a comment after all: emit the part of
						// "!--" that did match, count the '<' as one
						// level of nesting, and process b as usual.
						for j := 0; j < i; j++ {
							d.buf.WriteByte(s[j])
						}
						depth++
						goto HandleB
					}
				}

				// Remove < that was written above.
				d.buf.Truncate(d.buf.Len() - 1)

				// Look for terminator.
				// The comment's bytes are not written to the buffer,
				// so comments are dropped from the directive text.
				var b0, b1 byte
				for {
					if b, ok = d.mustgetc(); !ok {
						return nil, d.err
					}
					if b0 == '-' && b1 == '-' && b == '>' {
						break
					}
					b0, b1 = b1, b
				}
			}
		}
		return Directive(d.buf.Bytes()), nil
	}

	// Must be an open element like <a href="foo">
	d.ungetc(b)

	var (
		name  Name
		empty bool // set when the element ends in />
		attr  []Attr
	)
	if name, ok = d.nsname(); !ok {
		if d.err == nil {
			d.err = d.syntaxError("expected element name after <")
		}
		return nil, d.err
	}

	// Start from a non-nil empty slice, so an element without
	// attributes is returned with an empty, non-nil Attr.
	attr = []Attr{}
	for {
		d.space()
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		if b == '/' {
			empty = true
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			if b != '>' {
				d.err = d.syntaxError("expected /> in element")
				return nil, d.err
			}
			break
		}
		if b == '>' {
			break
		}
		d.ungetc(b)

		// Make room for one more attribute, doubling the capacity
		// (starting at 4) when the slice is full, then parse into the
		// new last slot in place.
		n := len(attr)
		if n >= cap(attr) {
			nCap := 2 * cap(attr)
			if nCap == 0 {
				nCap = 4
			}
			nattr := make([]Attr, n, nCap)
			copy(nattr, attr)
			attr = nattr
		}
		attr = attr[0 : n+1]
		a := &attr[n]
		if a.Name, ok = d.nsname(); !ok {
			if d.err == nil {
				d.err = d.syntaxError("expected attribute name in element")
			}
			return nil, d.err
		}
		d.space()
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		if b != '=' {
			if d.Strict {
				d.err = d.syntaxError("attribute name without = in element")
				return nil, d.err
			} else {
				// Non-strict mode: an attribute written without a
				// value gets its own local name as the value.
				d.ungetc(b)
				a.Value = a.Name.Local
			}
		} else {
			d.space()
			data := d.attrval()
			if data == nil {
				return nil, d.err
			}
			a.Value = string(data)
		}
	}
	if empty {
		// Self-closing element: arrange for the next call to return
		// the matching EndElement.
		d.needClose = true
		d.toClose = name
	}
	return StartElement{name, attr}, nil
}
 831
 832 func (d *Decoder) attrval() []byte {
 833         b, ok := d.mustgetc()
 834         if !ok {
 835                 return nil
 836         }
 837         // Handle quoted attribute values
 838         if b == '"' || b == '\'' {
 839                 return d.text(int(b), false)
 840         }
 841         // Handle unquoted attribute values for strict parsers
 842         if d.Strict {
 843                 d.err = d.syntaxError("unquoted or missing attribute value in element")
 844                 return nil
 845         }
 846         // Handle unquoted attribute values for unstrict parsers
 847         d.ungetc(b)
 848         d.buf.Reset()
 849         for {
 850                 b, ok = d.mustgetc()
 851                 if !ok {
 852                         return nil
 853                 }
 854                 // http://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.2
 855                 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' ||
 856                         '0' <= b && b <= '9' || b == '_' || b == ':' || b == '-' {
 857                         d.buf.WriteByte(b)
 858                 } else {
 859                         d.ungetc(b)
 860                         break
 861                 }
 862         }
 863         return d.buf.Bytes()
 864 }
 865
 866 // Skip spaces if any
 867 func (d *Decoder) space() {
 868         for {
 869                 b, ok := d.getc()
 870                 if !ok {
 871                         return
 872                 }
 873                 switch b {
 874                 case ' ', '\r', '\n', '\t':
 875                 default:
 876                         d.ungetc(b)
 877                         return
 878                 }
 879         }
 880 }
 881
 882 // Read a single byte.
 883 // If there is no byte to read, return ok==false
 884 // and leave the error in d.err.
 885 // Maintain line number.
 886 func (d *Decoder) getc() (b byte, ok bool) {
 887         if d.err != nil {
 888                 return 0, false
 889         }
 890         if d.nextByte >= 0 {
 891                 b = byte(d.nextByte)
 892                 d.nextByte = -1
 893         } else {
 894                 b, d.err = d.r.ReadByte()
 895                 if d.err != nil {
 896                         return 0, false
 897                 }
 898                 if d.saved != nil {
 899                         d.saved.WriteByte(b)
 900                 }
 901         }
 902         if b == '\n' {
 903                 d.line++
 904         }
 905         d.offset++
 906         return b, true
 907 }
 908
 909 // InputOffset returns the input stream byte offset of the current decoder position.
 910 // The offset gives the location of the end of the most recently returned token
 911 // and the beginning of the next token.
 912 func (d *Decoder) InputOffset() int64 {
 913         return d.offset
 914 }
 915
 916 // Return saved offset.
 917 // If we did ungetc (nextByte >= 0), have to back up one.
 918 func (d *Decoder) savedOffset() int {
 919         n := d.saved.Len()
 920         if d.nextByte >= 0 {
 921                 n--
 922         }
 923         return n
 924 }
 925
 926 // Must read a single byte.
 927 // If there is no byte to read,
 928 // set d.err to SyntaxError("unexpected EOF")
 929 // and return ok==false
 930 func (d *Decoder) mustgetc() (b byte, ok bool) {
 931         if b, ok = d.getc(); !ok {
 932                 if d.err == io.EOF {
 933                         d.err = d.syntaxError("unexpected EOF")
 934                 }
 935         }
 936         return
 937 }
 938
 939 // Unread a single byte.
 940 func (d *Decoder) ungetc(b byte) {
 941         if b == '\n' {
 942                 d.line--
 943         }
 944         d.nextByte = int(b)
 945         d.offset--
 946 }
 947
 948 var entity = map[string]int{
 949         "lt":   '<',
 950         "gt":   '>',
 951         "amp":  '&',
 952         "apos": '\'',
 953         "quot": '"',
 954 }
 955
 956 // Read plain text section (XML calls it character data).
 957 // If quote >= 0, we are in a quoted string and need to find the matching quote.
 958 // If cdata == true, we are in a <![CDATA[ section and need to find ]]>.
 959 // On failure return nil and leave the error in d.err.
 960 func (d *Decoder) text(quote int, cdata bool) []byte {
 961         var b0, b1 byte
 962         var trunc int
 963         d.buf.Reset()
 964 Input:
 965         for {
 966                 b, ok := d.getc()
 967                 if !ok {
 968                         if cdata {
 969                                 if d.err == io.EOF {
 970                                         d.err = d.syntaxError("unexpected EOF in CDATA section")
 971                                 }
 972                                 return nil
 973                         }
 974                         break Input
 975                 }
 976
 977                 // <![CDATA[ section ends with ]]>.
 978                 // It is an error for ]]> to appear in ordinary text.
 979                 if b0 == ']' && b1 == ']' && b == '>' {
 980                         if cdata {
 981                                 trunc = 2
 982                                 break Input
 983                         }
 984                         d.err = d.syntaxError("unescaped ]]> not in CDATA section")
 985                         return nil
 986                 }
 987
 988                 // Stop reading text if we see a <.
 989                 if b == '<' && !cdata {
 990                         if quote >= 0 {
 991                                 d.err = d.syntaxError("unescaped < inside quoted string")
 992                                 return nil
 993                         }
 994                         d.ungetc('<')
 995                         break Input
 996                 }
 997                 if quote >= 0 && b == byte(quote) {
 998                         break Input
 999                 }
1000                 if b == '&' && !cdata {
1001                         // Read escaped character expression up to semicolon.
1002                         // XML in all its glory allows a document to define and use
1003                         // its own character names with <!ENTITY ...> directives.
1004                         // Parsers are required to recognize lt, gt, amp, apos, and quot
1005                         // even if they have not been declared.
1006                         before := d.buf.Len()
1007                         d.buf.WriteByte('&')
1008                         var ok bool
1009                         var text string
1010                         var haveText bool
1011                         if b, ok = d.mustgetc(); !ok {
1012                                 return nil
1013                         }
1014                         if b == '#' {
1015                                 d.buf.WriteByte(b)
1016                                 if b, ok = d.mustgetc(); !ok {
1017                                         return nil
1018                                 }
1019                                 base := 10
1020                                 if b == 'x' {
1021                                         base = 16
1022                                         d.buf.WriteByte(b)
1023                                         if b, ok = d.mustgetc(); !ok {
1024                                                 return nil
1025                                         }
1026                                 }
1027                                 start := d.buf.Len()
1028                                 for '0' <= b && b <= '9' ||
1029                                         base == 16 && 'a' <= b && b <= 'f' ||
1030                                         base == 16 && 'A' <= b && b <= 'F' {
1031                                         d.buf.WriteByte(b)
1032                                         if b, ok = d.mustgetc(); !ok {
1033                                                 return nil
1034                                         }
1035                                 }
1036                                 if b != ';' {
1037                                         d.ungetc(b)
1038                                 } else {
1039                                         s := string(d.buf.Bytes()[start:])
1040                                         d.buf.WriteByte(';')
1041                                         n, err := strconv.ParseUint(s, base, 64)
1042                                         if err == nil && n <= unicode.MaxRune {
1043                                                 text = string(n)
1044                                                 haveText = true
1045                                         }
1046                                 }
1047                         } else {
1048                                 d.ungetc(b)
1049                                 if !d.readName() {
1050                                         if d.err != nil {
1051                                                 return nil
1052                                         }
1053                                         ok = false
1054                                 }
1055                                 if b, ok = d.mustgetc(); !ok {
1056                                         return nil
1057                                 }
1058                                 if b != ';' {
1059                                         d.ungetc(b)
1060                                 } else {
1061                                         name := d.buf.Bytes()[before+1:]
1062                                         d.buf.WriteByte(';')
1063                                         if isName(name) {
1064                                                 s := string(name)
1065                                                 if r, ok := entity[s]; ok {
1066                                                         text = string(r)
1067                                                         haveText = true
1068                                                 } else if d.Entity != nil {
1069                                                         text, haveText = d.Entity[s]
1070                                                 }
1071                                         }
1072                                 }
1073                         }
1074
1075                         if haveText {
1076                                 d.buf.Truncate(before)
1077                                 d.buf.Write([]byte(text))
1078                                 b0, b1 = 0, 0
1079                                 continue Input
1080                         }
1081                         if !d.Strict {
1082                                 b0, b1 = 0, 0
1083                                 continue Input
1084                         }
1085                         ent := string(d.buf.Bytes()[before:])
1086                         if ent[len(ent)-1] != ';' {
1087                                 ent += " (no semicolon)"
1088                         }
1089                         d.err = d.syntaxError("invalid character entity " + ent)
1090                         return nil
1091                 }
1092
1093                 // We must rewrite unescaped \r and \r\n into \n.
1094                 if b == '\r' {
1095                         d.buf.WriteByte('\n')
1096                 } else if b1 == '\r' && b == '\n' {
1097                         // Skip \r\n--we already wrote \n.
1098                 } else {
1099                         d.buf.WriteByte(b)
1100                 }
1101
1102                 b0, b1 = b1, b
1103         }
1104         data := d.buf.Bytes()
1105         data = data[0 : len(data)-trunc]
1106
1107         // Inspect each rune for being a disallowed character.
1108         buf := data
1109         for len(buf) > 0 {
1110                 r, size := utf8.DecodeRune(buf)
1111                 if r == utf8.RuneError && size == 1 {
1112                         d.err = d.syntaxError("invalid UTF-8")
1113                         return nil
1114                 }
1115                 buf = buf[size:]
1116                 if !isInCharacterRange(r) {
1117                         d.err = d.syntaxError(fmt.Sprintf("illegal character code %U", r))
1118                         return nil
1119                 }
1120         }
1121
1122         return data
1123 }
1124
// isInCharacterRange reports whether the given rune is in the XML Character
// Range, per the Char production of http://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters:
//
//	Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
//
// The surrogate block U+D800..U+DFFF and the code points U+FFFE and U+FFFF
// are therefore excluded, as are all control characters below U+0020 other
// than tab, line feed and carriage return.
func isInCharacterRange(r rune) (inrange bool) {
	return r == 0x09 ||
		r == 0x0A ||
		r == 0x0D ||
		r >= 0x20 && r <= 0xD7FF ||
		r >= 0xE000 && r <= 0xFFFD ||
		r >= 0x10000 && r <= 0x10FFFF
}
1136
1137 // Get name space name: name with a : stuck in the middle.
1138 // The part before the : is the name space identifier.
1139 func (d *Decoder) nsname() (name Name, ok bool) {
1140         s, ok := d.name()
1141         if !ok {
1142                 return
1143         }
1144         i := strings.Index(s, ":")
1145         if i < 0 {
1146                 name.Local = s
1147         } else {
1148                 name.Space = s[0:i]
1149                 name.Local = s[i+1:]
1150         }
1151         return name, true
1152 }
1153
1154 // Get name: /first(first|second)*/
1155 // Do not set d.err if the name is missing (unless unexpected EOF is received):
1156 // let the caller provide better context.
1157 func (d *Decoder) name() (s string, ok bool) {
1158         d.buf.Reset()
1159         if !d.readName() {
1160                 return "", false
1161         }
1162
1163         // Now we check the characters.
1164         b := d.buf.Bytes()
1165         if !isName(b) {
1166                 d.err = d.syntaxError("invalid XML name: " + string(b))
1167                 return "", false
1168         }
1169         return string(b), true
1170 }
1171
1172 // Read a name and append its bytes to d.buf.
1173 // The name is delimited by any single-byte character not valid in names.
1174 // All multi-byte characters are accepted; the caller must check their validity.
1175 func (d *Decoder) readName() (ok bool) {
1176         var b byte
1177         if b, ok = d.mustgetc(); !ok {
1178                 return
1179         }
1180         if b < utf8.RuneSelf && !isNameByte(b) {
1181                 d.ungetc(b)
1182                 return false
1183         }
1184         d.buf.WriteByte(b)
1185
1186         for {
1187                 if b, ok = d.mustgetc(); !ok {
1188                         return
1189                 }
1190                 if b < utf8.RuneSelf && !isNameByte(b) {
1191                         d.ungetc(b)
1192                         break
1193                 }
1194                 d.buf.WriteByte(b)
1195         }
1196         return true
1197 }
1198
// isNameByte reports whether c is an ASCII letter, an ASCII digit, or one
// of '_', ':', '.' and '-'.
func isNameByte(c byte) bool {
	switch {
	case 'A' <= c && c <= 'Z',
		'a' <= c && c <= 'z',
		'0' <= c && c <= '9':
		return true
	}
	switch c {
	case '_', ':', '.', '-':
		return true
	}
	return false
}
1205
1206 func isName(s []byte) bool {
1207         if len(s) == 0 {
1208                 return false
1209         }
1210         c, n := utf8.DecodeRune(s)
1211         if c == utf8.RuneError && n == 1 {
1212                 return false
1213         }
1214         if !unicode.Is(first, c) {
1215                 return false
1216         }
1217         for n < len(s) {
1218                 s = s[n:]
1219                 c, n = utf8.DecodeRune(s)
1220                 if c == utf8.RuneError && n == 1 {
1221                         return false
1222                 }
1223                 if !unicode.Is(first, c) && !unicode.Is(second, c) {
1224                         return false
1225                 }
1226         }
1227         return true
1228 }
1229
1230 func isNameString(s string) bool {
1231         if len(s) == 0 {
1232                 return false
1233         }
1234         c, n := utf8.DecodeRuneInString(s)
1235         if c == utf8.RuneError && n == 1 {
1236                 return false
1237         }
1238         if !unicode.Is(first, c) {
1239                 return false
1240         }
1241         for n < len(s) {
1242                 s = s[n:]
1243                 c, n = utf8.DecodeRuneInString(s)
1244                 if c == utf8.RuneError && n == 1 {
1245                         return false
1246                 }
1247                 if !unicode.Is(first, c) && !unicode.Is(second, c) {
1248                         return false
1249                 }
1250         }
1251         return true
1252 }
1253
// These tables were generated by cut and paste from Appendix B of
// the XML spec at http://www.xml.com/axml/testaxml.htm
// and then reformatting. First corresponds to (Letter | '_' | ':')
// and second corresponds to NameChar.

// first is the set of runes that isName and isNameString accept as the
// first character of a name; they also accept these runes in every later
// position. Each entry is a unicode.Range16{Lo, Hi, Stride}: an entry whose
// stride equals Hi-Lo selects just the two code points Lo and Hi.
var first = &unicode.RangeTable{
        R16: []unicode.Range16{
                {0x003A, 0x003A, 1},
                {0x0041, 0x005A, 1},
                {0x005F, 0x005F, 1},
                {0x0061, 0x007A, 1},
                {0x00C0, 0x00D6, 1},
                {0x00D8, 0x00F6, 1},
                {0x00F8, 0x00FF, 1},
                {0x0100, 0x0131, 1},
                {0x0134, 0x013E, 1},
                {0x0141, 0x0148, 1},
                {0x014A, 0x017E, 1},
                {0x0180, 0x01C3, 1},
                {0x01CD, 0x01F0, 1},
                {0x01F4, 0x01F5, 1},
                {0x01FA, 0x0217, 1},
                {0x0250, 0x02A8, 1},
                {0x02BB, 0x02C1, 1},
                {0x0386, 0x0386, 1},
                {0x0388, 0x038A, 1},
                {0x038C, 0x038C, 1},
                {0x038E, 0x03A1, 1},
                {0x03A3, 0x03CE, 1},
                {0x03D0, 0x03D6, 1},
                {0x03DA, 0x03E0, 2},
                {0x03E2, 0x03F3, 1},
                {0x0401, 0x040C, 1},
                {0x040E, 0x044F, 1},
                {0x0451, 0x045C, 1},
                {0x045E, 0x0481, 1},
                {0x0490, 0x04C4, 1},
                {0x04C7, 0x04C8, 1},
                {0x04CB, 0x04CC, 1},
                {0x04D0, 0x04EB, 1},
                {0x04EE, 0x04F5, 1},
                {0x04F8, 0x04F9, 1},
                {0x0531, 0x0556, 1},
                {0x0559, 0x0559, 1},
                {0x0561, 0x0586, 1},
                {0x05D0, 0x05EA, 1},
                {0x05F0, 0x05F2, 1},
                {0x0621, 0x063A, 1},
                {0x0641, 0x064A, 1},
                {0x0671, 0x06B7, 1},
                {0x06BA, 0x06BE, 1},
                {0x06C0, 0x06CE, 1},
                {0x06D0, 0x06D3, 1},
                {0x06D5, 0x06D5, 1},
                {0x06E5, 0x06E6, 1},
                {0x0905, 0x0939, 1},
                {0x093D, 0x093D, 1},
                {0x0958, 0x0961, 1},
                {0x0985, 0x098C, 1},
                {0x098F, 0x0990, 1},
                {0x0993, 0x09A8, 1},
                {0x09AA, 0x09B0, 1},
                {0x09B2, 0x09B2, 1},
                {0x09B6, 0x09B9, 1},
                {0x09DC, 0x09DD, 1},
                {0x09DF, 0x09E1, 1},
                {0x09F0, 0x09F1, 1},
                {0x0A05, 0x0A0A, 1},
                {0x0A0F, 0x0A10, 1},
                {0x0A13, 0x0A28, 1},
                {0x0A2A, 0x0A30, 1},
                {0x0A32, 0x0A33, 1},
                {0x0A35, 0x0A36, 1},
                {0x0A38, 0x0A39, 1},
                {0x0A59, 0x0A5C, 1},
                {0x0A5E, 0x0A5E, 1},
                {0x0A72, 0x0A74, 1},
                {0x0A85, 0x0A8B, 1},
                {0x0A8D, 0x0A8D, 1},
                {0x0A8F, 0x0A91, 1},
                {0x0A93, 0x0AA8, 1},
                {0x0AAA, 0x0AB0, 1},
                {0x0AB2, 0x0AB3, 1},
                {0x0AB5, 0x0AB9, 1},
                {0x0ABD, 0x0AE0, 0x23},
                {0x0B05, 0x0B0C, 1},
                {0x0B0F, 0x0B10, 1},
                {0x0B13, 0x0B28, 1},
                {0x0B2A, 0x0B30, 1},
                {0x0B32, 0x0B33, 1},
                {0x0B36, 0x0B39, 1},
                {0x0B3D, 0x0B3D, 1},
                {0x0B5C, 0x0B5D, 1},
                {0x0B5F, 0x0B61, 1},
                {0x0B85, 0x0B8A, 1},
                {0x0B8E, 0x0B90, 1},
                {0x0B92, 0x0B95, 1},
                {0x0B99, 0x0B9A, 1},
                {0x0B9C, 0x0B9C, 1},
                {0x0B9E, 0x0B9F, 1},
                {0x0BA3, 0x0BA4, 1},
                {0x0BA8, 0x0BAA, 1},
                {0x0BAE, 0x0BB5, 1},
                {0x0BB7, 0x0BB9, 1},
                {0x0C05, 0x0C0C, 1},
                {0x0C0E, 0x0C10, 1},
                {0x0C12, 0x0C28, 1},
                {0x0C2A, 0x0C33, 1},
                {0x0C35, 0x0C39, 1},
                {0x0C60, 0x0C61, 1},
                {0x0C85, 0x0C8C, 1},
                {0x0C8E, 0x0C90, 1},
                {0x0C92, 0x0CA8, 1},
                {0x0CAA, 0x0CB3, 1},
                {0x0CB5, 0x0CB9, 1},
                {0x0CDE, 0x0CDE, 1},
                {0x0CE0, 0x0CE1, 1},
                {0x0D05, 0x0D0C, 1},
                {0x0D0E, 0x0D10, 1},
                {0x0D12, 0x0D28, 1},
                {0x0D2A, 0x0D39, 1},
                {0x0D60, 0x0D61, 1},
                {0x0E01, 0x0E2E, 1},
                {0x0E30, 0x0E30, 1},
                {0x0E32, 0x0E33, 1},
                {0x0E40, 0x0E45, 1},
                {0x0E81, 0x0E82, 1},
                {0x0E84, 0x0E84, 1},
                {0x0E87, 0x0E88, 1},
                {0x0E8A, 0x0E8D, 3},
                {0x0E94, 0x0E97, 1},
                {0x0E99, 0x0E9F, 1},
                {0x0EA1, 0x0EA3, 1},
                {0x0EA5, 0x0EA7, 2},
                {0x0EAA, 0x0EAB, 1},
                {0x0EAD, 0x0EAE, 1},
                {0x0EB0, 0x0EB0, 1},
                {0x0EB2, 0x0EB3, 1},
                {0x0EBD, 0x0EBD, 1},
                {0x0EC0, 0x0EC4, 1},
                {0x0F40, 0x0F47, 1},
                {0x0F49, 0x0F69, 1},
                {0x10A0, 0x10C5, 1},
                {0x10D0, 0x10F6, 1},
                {0x1100, 0x1100, 1},
                {0x1102, 0x1103, 1},
                {0x1105, 0x1107, 1},
                {0x1109, 0x1109, 1},
                {0x110B, 0x110C, 1},
                {0x110E, 0x1112, 1},
                {0x113C, 0x1140, 2},
                {0x114C, 0x1150, 2},
                {0x1154, 0x1155, 1},
                {0x1159, 0x1159, 1},
                {0x115F, 0x1161, 1},
                {0x1163, 0x1169, 2},
                {0x116D, 0x116E, 1},
                {0x1172, 0x1173, 1},
                {0x1175, 0x119E, 0x119E - 0x1175},
                {0x11A8, 0x11AB, 0x11AB - 0x11A8},
                {0x11AE, 0x11AF, 1},
                {0x11B7, 0x11B8, 1},
                {0x11BA, 0x11BA, 1},
                {0x11BC, 0x11C2, 1},
                {0x11EB, 0x11F0, 0x11F0 - 0x11EB},
                {0x11F9, 0x11F9, 1},
                {0x1E00, 0x1E9B, 1},
                {0x1EA0, 0x1EF9, 1},
                {0x1F00, 0x1F15, 1},
                {0x1F18, 0x1F1D, 1},
                {0x1F20, 0x1F45, 1},
                {0x1F48, 0x1F4D, 1},
                {0x1F50, 0x1F57, 1},
                {0x1F59, 0x1F5B, 0x1F5B - 0x1F59},
                {0x1F5D, 0x1F5D, 1},
                {0x1F5F, 0x1F7D, 1},
                {0x1F80, 0x1FB4, 1},
                {0x1FB6, 0x1FBC, 1},
                {0x1FBE, 0x1FBE, 1},
                {0x1FC2, 0x1FC4, 1},
                {0x1FC6, 0x1FCC, 1},
                {0x1FD0, 0x1FD3, 1},
                {0x1FD6, 0x1FDB, 1},
                {0x1FE0, 0x1FEC, 1},
                {0x1FF2, 0x1FF4, 1},
                {0x1FF6, 0x1FFC, 1},
                {0x2126, 0x2126, 1},
                {0x212A, 0x212B, 1},
                {0x212E, 0x212E, 1},
                {0x2180, 0x2182, 1},
                {0x3007, 0x3007, 1},
                {0x3021, 0x3029, 1},
                {0x3041, 0x3094, 1},
                {0x30A1, 0x30FA, 1},
                {0x3105, 0x312C, 1},
                {0x4E00, 0x9FA5, 1},
                {0xAC00, 0xD7A3, 1},
        },
}
1453
// second is the set of additional runes that isName and isNameString accept
// after the first character of a name (a later rune is valid when it is in
// either first or second). It begins with '-', '.' and the ASCII digits.
// Each entry is a unicode.Range16{Lo, Hi, Stride}: an entry whose stride
// equals Hi-Lo selects just the two code points Lo and Hi.
var second = &unicode.RangeTable{
        R16: []unicode.Range16{
                {0x002D, 0x002E, 1},
                {0x0030, 0x0039, 1},
                {0x00B7, 0x00B7, 1},
                {0x02D0, 0x02D1, 1},
                {0x0300, 0x0345, 1},
                {0x0360, 0x0361, 1},
                {0x0387, 0x0387, 1},
                {0x0483, 0x0486, 1},
                {0x0591, 0x05A1, 1},
                {0x05A3, 0x05B9, 1},
                {0x05BB, 0x05BD, 1},
                {0x05BF, 0x05BF, 1},
                {0x05C1, 0x05C2, 1},
                {0x05C4, 0x0640, 0x0640 - 0x05C4},
                {0x064B, 0x0652, 1},
                {0x0660, 0x0669, 1},
                {0x0670, 0x0670, 1},
                {0x06D6, 0x06DC, 1},
                {0x06DD, 0x06DF, 1},
                {0x06E0, 0x06E4, 1},
                {0x06E7, 0x06E8, 1},
                {0x06EA, 0x06ED, 1},
                {0x06F0, 0x06F9, 1},
                {0x0901, 0x0903, 1},
                {0x093C, 0x093C, 1},
                {0x093E, 0x094C, 1},
                {0x094D, 0x094D, 1},
                {0x0951, 0x0954, 1},
                {0x0962, 0x0963, 1},
                {0x0966, 0x096F, 1},
                {0x0981, 0x0983, 1},
                {0x09BC, 0x09BC, 1},
                {0x09BE, 0x09BF, 1},
                {0x09C0, 0x09C4, 1},
                {0x09C7, 0x09C8, 1},
                {0x09CB, 0x09CD, 1},
                {0x09D7, 0x09D7, 1},
                {0x09E2, 0x09E3, 1},
                {0x09E6, 0x09EF, 1},
                {0x0A02, 0x0A3C, 0x3A},
                {0x0A3E, 0x0A3F, 1},
                {0x0A40, 0x0A42, 1},
                {0x0A47, 0x0A48, 1},
                {0x0A4B, 0x0A4D, 1},
                {0x0A66, 0x0A6F, 1},
                {0x0A70, 0x0A71, 1},
                {0x0A81, 0x0A83, 1},
                {0x0ABC, 0x0ABC, 1},
                {0x0ABE, 0x0AC5, 1},
                {0x0AC7, 0x0AC9, 1},
                {0x0ACB, 0x0ACD, 1},
                {0x0AE6, 0x0AEF, 1},
                {0x0B01, 0x0B03, 1},
                {0x0B3C, 0x0B3C, 1},
                {0x0B3E, 0x0B43, 1},
                {0x0B47, 0x0B48, 1},
                {0x0B4B, 0x0B4D, 1},
                {0x0B56, 0x0B57, 1},
                {0x0B66, 0x0B6F, 1},
                {0x0B82, 0x0B83, 1},
                {0x0BBE, 0x0BC2, 1},
                {0x0BC6, 0x0BC8, 1},
                {0x0BCA, 0x0BCD, 1},
                {0x0BD7, 0x0BD7, 1},
                {0x0BE7, 0x0BEF, 1},
                {0x0C01, 0x0C03, 1},
                {0x0C3E, 0x0C44, 1},
                {0x0C46, 0x0C48, 1},
                {0x0C4A, 0x0C4D, 1},
                {0x0C55, 0x0C56, 1},
                {0x0C66, 0x0C6F, 1},
                {0x0C82, 0x0C83, 1},
                {0x0CBE, 0x0CC4, 1},
                {0x0CC6, 0x0CC8, 1},
                {0x0CCA, 0x0CCD, 1},
                {0x0CD5, 0x0CD6, 1},
                {0x0CE6, 0x0CEF, 1},
                {0x0D02, 0x0D03, 1},
                {0x0D3E, 0x0D43, 1},
                {0x0D46, 0x0D48, 1},
                {0x0D4A, 0x0D4D, 1},
                {0x0D57, 0x0D57, 1},
                {0x0D66, 0x0D6F, 1},
                {0x0E31, 0x0E31, 1},
                {0x0E34, 0x0E3A, 1},
                {0x0E46, 0x0E46, 1},
                {0x0E47, 0x0E4E, 1},
                {0x0E50, 0x0E59, 1},
                {0x0EB1, 0x0EB1, 1},
                {0x0EB4, 0x0EB9, 1},
                {0x0EBB, 0x0EBC, 1},
                {0x0EC6, 0x0EC6, 1},
                {0x0EC8, 0x0ECD, 1},
                {0x0ED0, 0x0ED9, 1},
                {0x0F18, 0x0F19, 1},
                {0x0F20, 0x0F29, 1},
                {0x0F35, 0x0F39, 2},
                {0x0F3E, 0x0F3F, 1},
                {0x0F71, 0x0F84, 1},
                {0x0F86, 0x0F8B, 1},
                {0x0F90, 0x0F95, 1},
                {0x0F97, 0x0F97, 1},
                {0x0F99, 0x0FAD, 1},
                {0x0FB1, 0x0FB7, 1},
                {0x0FB9, 0x0FB9, 1},
                {0x20D0, 0x20DC, 1},
                {0x20E1, 0x3005, 0x3005 - 0x20E1},
                {0x302A, 0x302F, 1},
                {0x3031, 0x3035, 1},
                {0x3099, 0x309A, 1},
                {0x309D, 0x309E, 1},
                {0x30FC, 0x30FE, 1},
        },
}
1570
// HTMLEntity is an entity map containing translations for the
// standard HTML entity characters.
// It is assigned directly from htmlEntity, so both names refer to the
// same underlying map: an entry added or changed through one is visible
// through the other.
var HTMLEntity = htmlEntity
1574
// htmlEntity maps HTML entity names, written without the leading '&'
// and the trailing ';', to their replacement text. Every value is a
// single character spelled as a \u escape. The exported HTMLEntity
// variable refers to this same map.
var htmlEntity = map[string]string{
	// The entries below were produced by the following pipeline.
	// Its final substitution captures exactly four hexadecimal digits
	// after "U+", so it only emits code points that fit in a \uXXXX escape.
	/*
		hget http://www.w3.org/TR/html4/sgml/entities.html |
		ssam '
			,y /\&gt;/ x/\&lt;(.|\n)+/ s/\n/ /g
			,x v/^\&lt;!ENTITY/d
			,s/\&lt;!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/	"\1": "\\u\2",/g
		'
	*/
	"nbsp":     "\u00A0",
	"iexcl":    "\u00A1",
	"cent":     "\u00A2",
	"pound":    "\u00A3",
	"curren":   "\u00A4",
	"yen":      "\u00A5",
	"brvbar":   "\u00A6",
	"sect":     "\u00A7",
	"uml":      "\u00A8",
	"copy":     "\u00A9",
	"ordf":     "\u00AA",
	"laquo":    "\u00AB",
	"not":      "\u00AC",
	"shy":      "\u00AD",
	"reg":      "\u00AE",
	"macr":     "\u00AF",
	"deg":      "\u00B0",
	"plusmn":   "\u00B1",
	"sup2":     "\u00B2",
	"sup3":     "\u00B3",
	"acute":    "\u00B4",
	"micro":    "\u00B5",
	"para":     "\u00B6",
	"middot":   "\u00B7",
	"cedil":    "\u00B8",
	"sup1":     "\u00B9",
	"ordm":     "\u00BA",
	"raquo":    "\u00BB",
	"frac14":   "\u00BC",
	"frac12":   "\u00BD",
	"frac34":   "\u00BE",
	"iquest":   "\u00BF",
	"Agrave":   "\u00C0",
	"Aacute":   "\u00C1",
	"Acirc":    "\u00C2",
	"Atilde":   "\u00C3",
	"Auml":     "\u00C4",
	"Aring":    "\u00C5",
	"AElig":    "\u00C6",
	"Ccedil":   "\u00C7",
	"Egrave":   "\u00C8",
	"Eacute":   "\u00C9",
	"Ecirc":    "\u00CA",
	"Euml":     "\u00CB",
	"Igrave":   "\u00CC",
	"Iacute":   "\u00CD",
	"Icirc":    "\u00CE",
	"Iuml":     "\u00CF",
	"ETH":      "\u00D0",
	"Ntilde":   "\u00D1",
	"Ograve":   "\u00D2",
	"Oacute":   "\u00D3",
	"Ocirc":    "\u00D4",
	"Otilde":   "\u00D5",
	"Ouml":     "\u00D6",
	"times":    "\u00D7",
	"Oslash":   "\u00D8",
	"Ugrave":   "\u00D9",
	"Uacute":   "\u00DA",
	"Ucirc":    "\u00DB",
	"Uuml":     "\u00DC",
	"Yacute":   "\u00DD",
	"THORN":    "\u00DE",
	"szlig":    "\u00DF",
	"agrave":   "\u00E0",
	"aacute":   "\u00E1",
	"acirc":    "\u00E2",
	"atilde":   "\u00E3",
	"auml":     "\u00E4",
	"aring":    "\u00E5",
	"aelig":    "\u00E6",
	"ccedil":   "\u00E7",
	"egrave":   "\u00E8",
	"eacute":   "\u00E9",
	"ecirc":    "\u00EA",
	"euml":     "\u00EB",
	"igrave":   "\u00EC",
	"iacute":   "\u00ED",
	"icirc":    "\u00EE",
	"iuml":     "\u00EF",
	"eth":      "\u00F0",
	"ntilde":   "\u00F1",
	"ograve":   "\u00F2",
	"oacute":   "\u00F3",
	"ocirc":    "\u00F4",
	"otilde":   "\u00F5",
	"ouml":     "\u00F6",
	"divide":   "\u00F7",
	"oslash":   "\u00F8",
	"ugrave":   "\u00F9",
	"uacute":   "\u00FA",
	"ucirc":    "\u00FB",
	"uuml":     "\u00FC",
	"yacute":   "\u00FD",
	"thorn":    "\u00FE",
	"yuml":     "\u00FF",
	"fnof":     "\u0192",
	"Alpha":    "\u0391",
	"Beta":     "\u0392",
	"Gamma":    "\u0393",
	"Delta":    "\u0394",
	"Epsilon":  "\u0395",
	"Zeta":     "\u0396",
	"Eta":      "\u0397",
	"Theta":    "\u0398",
	"Iota":     "\u0399",
	"Kappa":    "\u039A",
	"Lambda":   "\u039B",
	"Mu":       "\u039C",
	"Nu":       "\u039D",
	"Xi":       "\u039E",
	"Omicron":  "\u039F",
	"Pi":       "\u03A0",
	"Rho":      "\u03A1",
	"Sigma":    "\u03A3",
	"Tau":      "\u03A4",
	"Upsilon":  "\u03A5",
	"Phi":      "\u03A6",
	"Chi":      "\u03A7",
	"Psi":      "\u03A8",
	"Omega":    "\u03A9",
	"alpha":    "\u03B1",
	"beta":     "\u03B2",
	"gamma":    "\u03B3",
	"delta":    "\u03B4",
	"epsilon":  "\u03B5",
	"zeta":     "\u03B6",
	"eta":      "\u03B7",
	"theta":    "\u03B8",
	"iota":     "\u03B9",
	"kappa":    "\u03BA",
	"lambda":   "\u03BB",
	"mu":       "\u03BC",
	"nu":       "\u03BD",
	"xi":       "\u03BE",
	"omicron":  "\u03BF",
	"pi":       "\u03C0",
	"rho":      "\u03C1",
	"sigmaf":   "\u03C2",
	"sigma":    "\u03C3",
	"tau":      "\u03C4",
	"upsilon":  "\u03C5",
	"phi":      "\u03C6",
	"chi":      "\u03C7",
	"psi":      "\u03C8",
	"omega":    "\u03C9",
	"thetasym": "\u03D1",
	"upsih":    "\u03D2",
	"piv":      "\u03D6",
	"bull":     "\u2022",
	"hellip":   "\u2026",
	"prime":    "\u2032",
	"Prime":    "\u2033",
	"oline":    "\u203E",
	"frasl":    "\u2044",
	"weierp":   "\u2118",
	"image":    "\u2111",
	"real":     "\u211C",
	"trade":    "\u2122",
	"alefsym":  "\u2135",
	"larr":     "\u2190",
	"uarr":     "\u2191",
	"rarr":     "\u2192",
	"darr":     "\u2193",
	"harr":     "\u2194",
	"crarr":    "\u21B5",
	"lArr":     "\u21D0",
	"uArr":     "\u21D1",
	"rArr":     "\u21D2",
	"dArr":     "\u21D3",
	"hArr":     "\u21D4",
	"forall":   "\u2200",
	"part":     "\u2202",
	"exist":    "\u2203",
	"empty":    "\u2205",
	"nabla":    "\u2207",
	"isin":     "\u2208",
	"notin":    "\u2209",
	"ni":       "\u220B",
	"prod":     "\u220F",
	"sum":      "\u2211",
	"minus":    "\u2212",
	"lowast":   "\u2217",
	"radic":    "\u221A",
	"prop":     "\u221D",
	"infin":    "\u221E",
	"ang":      "\u2220",
	"and":      "\u2227",
	"or":       "\u2228",
	"cap":      "\u2229",
	"cup":      "\u222A",
	"int":      "\u222B",
	"there4":   "\u2234",
	"sim":      "\u223C",
	"cong":     "\u2245",
	"asymp":    "\u2248",
	"ne":       "\u2260",
	"equiv":    "\u2261",
	"le":       "\u2264",
	"ge":       "\u2265",
	"sub":      "\u2282",
	"sup":      "\u2283",
	"nsub":     "\u2284",
	"sube":     "\u2286",
	"supe":     "\u2287",
	"oplus":    "\u2295",
	"otimes":   "\u2297",
	"perp":     "\u22A5",
	"sdot":     "\u22C5",
	"lceil":    "\u2308",
	"rceil":    "\u2309",
	"lfloor":   "\u230A",
	"rfloor":   "\u230B",
	"lang":     "\u2329",
	"rang":     "\u232A",
	"loz":      "\u25CA",
	"spades":   "\u2660",
	"clubs":    "\u2663",
	"hearts":   "\u2665",
	"diams":    "\u2666",
	"quot":     "\u0022",
	"amp":      "\u0026",
	"lt":       "\u003C",
	"gt":       "\u003E",
	"OElig":    "\u0152",
	"oelig":    "\u0153",
	"Scaron":   "\u0160",
	"scaron":   "\u0161",
	"Yuml":     "\u0178",
	"circ":     "\u02C6",
	"tilde":    "\u02DC",
	"ensp":     "\u2002",
	"emsp":     "\u2003",
	"thinsp":   "\u2009",
	"zwnj":     "\u200C",
	"zwj":      "\u200D",
	"lrm":      "\u200E",
	"rlm":      "\u200F",
	"ndash":    "\u2013",
	"mdash":    "\u2014",
	"lsquo":    "\u2018",
	"rsquo":    "\u2019",
	"sbquo":    "\u201A",
	"ldquo":    "\u201C",
	"rdquo":    "\u201D",
	"bdquo":    "\u201E",
	"dagger":   "\u2020",
	"Dagger":   "\u2021",
	"permil":   "\u2030",
	"lsaquo":   "\u2039",
	"rsaquo":   "\u203A",
	"euro":     "\u20AC",
}
1837
// HTMLAutoClose is the set of HTML elements that
// should be considered to close automatically.
// It refers to the same slice as the unexported htmlAutoClose.
var HTMLAutoClose = htmlAutoClose

// htmlAutoClose lists, in lower case, the element names backing
// HTMLAutoClose.
var htmlAutoClose = []string{
	// The names were extracted from the HTML 4 loose DTD with:
	/*
		hget http://www.w3.org/TR/html4/loose.dtd |
		9 sed -n 's/<!ELEMENT ([^ ]*) +- O EMPTY.+/	"\1",/p' | tr A-Z a-z
	*/
	"basefont", "br", "area", "link", "img",
	"param", "hr", "input", "col", "frame",
	"isindex", "base", "meta",
}
1861
// Replacement sequences for characters that have a meaning in XML markup.
var (
	esc_quot = []byte("&#34;") // shorter than "&quot;"
	esc_apos = []byte("&#39;") // shorter than "&apos;"
	esc_amp  = []byte("&amp;")
	esc_lt   = []byte("&lt;")
	esc_gt   = []byte("&gt;")
)

// Replacement sequences for tab, newline and carriage return,
// written as hexadecimal character references.
var (
	esc_tab = []byte("&#x9;")
	esc_nl  = []byte("&#xA;")
	esc_cr  = []byte("&#xD;")
)

// esc_fffd is the UTF-8 encoding of the Unicode replacement character.
var esc_fffd = []byte("\uFFFD")
1873
1874 // EscapeText writes to w the properly escaped XML equivalent
1875 // of the plain text data s.
1876 func EscapeText(w io.Writer, s []byte) error {
1877         return escapeText(w, s, true)
1878 }
1879
1880 // escapeText writes to w the properly escaped XML equivalent
1881 // of the plain text data s. If escapeNewline is true, newline
1882 // characters will be escaped.
1883 func escapeText(w io.Writer, s []byte, escapeNewline bool) error {
1884         var esc []byte
1885         last := 0
1886         for i := 0; i < len(s); {
1887                 r, width := utf8.DecodeRune(s[i:])
1888                 i += width
1889                 switch r {
1890                 case '"':
1891                         esc = esc_quot
1892                 case '\'':
1893                         esc = esc_apos
1894                 case '&':
1895                         esc = esc_amp
1896                 case '<':
1897                         esc = esc_lt
1898                 case '>':
1899                         esc = esc_gt
1900                 case '\t':
1901                         esc = esc_tab
1902                 case '\n':
1903                         if !escapeNewline {
1904                                 continue
1905                         }
1906                         esc = esc_nl
1907                 case '\r':
1908                         esc = esc_cr
1909                 default:
1910                         if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) {
1911                                 esc = esc_fffd
1912                                 break
1913                         }
1914                         continue
1915                 }
1916                 if _, err := w.Write(s[last : i-width]); err != nil {
1917                         return err
1918                 }
1919                 if _, err := w.Write(esc); err != nil {
1920                         return err
1921                 }
1922                 last = i
1923         }
1924         if _, err := w.Write(s[last:]); err != nil {
1925                 return err
1926         }
1927         return nil
1928 }
1929
1930 // EscapeString writes to p the properly escaped XML equivalent
1931 // of the plain text data s.
1932 func (p *printer) EscapeString(s string) {
1933         var esc []byte
1934         last := 0
1935         for i := 0; i < len(s); {
1936                 r, width := utf8.DecodeRuneInString(s[i:])
1937                 i += width
1938                 switch r {
1939                 case '"':
1940                         esc = esc_quot
1941                 case '\'':
1942                         esc = esc_apos
1943                 case '&':
1944                         esc = esc_amp
1945                 case '<':
1946                         esc = esc_lt
1947                 case '>':
1948                         esc = esc_gt
1949                 case '\t':
1950                         esc = esc_tab
1951                 case '\n':
1952                         esc = esc_nl
1953                 case '\r':
1954                         esc = esc_cr
1955                 default:
1956                         if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) {
1957                                 esc = esc_fffd
1958                                 break
1959                         }
1960                         continue
1961                 }
1962                 p.WriteString(s[last : i-width])
1963                 p.Write(esc)
1964                 last = i
1965         }
1966         p.WriteString(s[last:])
1967 }
1968
1969 // Escape is like EscapeText but omits the error return value.
1970 // It is provided for backwards compatibility with Go 1.0.
1971 // Code targeting Go 1.1 or later should use EscapeText.
1972 func Escape(w io.Writer, s []byte) {
1973         EscapeText(w, s)
1974 }
1975
// procInst parses the `param="..."` or `param='...'`
// value out of the provided string, returning "" if not found.
//
// s is scanned left to right for the text `param=`. An occurrence that
// is not immediately followed by a single or double quote is skipped
// and the scan continues after it, so a later well-formed occurrence
// is still found. The result is the text between the opening quote and
// the next quote of the same kind. "" is returned when no quoted
// occurrence exists or when the closing quote is missing.
func procInst(param, s string) string {
	// TODO: this parsing is somewhat lame and not exact.
	// It works for all actual cases, though. In particular it does
	// not check that param starts at a name boundary, nor does it
	// skip occurrences that sit inside another quoted value.
	param = param + "="
	rest := s
	for {
		idx := strings.Index(rest, param)
		if idx == -1 {
			return ""
		}
		v := rest[idx+len(param):]
		if v == "" {
			return ""
		}
		if quote := v[0]; quote == '\'' || quote == '"' {
			end := strings.IndexByte(v[1:], quote)
			if end == -1 {
				return ""
			}
			return v[1 : end+1]
		}
		// Unquoted value: keep looking after this occurrence. rest
		// shrinks by at least len(param) each time, so the loop ends.
		rest = v
	}
}