OSDN Git Service

new repo
[bytom/vapor.git] / vendor / golang.org / x / net / html / charset / charset.go
1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package charset provides common text encodings for HTML documents.
6 //
7 // The mapping from encoding labels to encodings is defined at
8 // https://encoding.spec.whatwg.org/.
9 package charset // import "golang.org/x/net/html/charset"
10
11 import (
12         "bytes"
13         "fmt"
14         "io"
15         "mime"
16         "strings"
17         "unicode/utf8"
18
19         "golang.org/x/net/html"
20         "golang.org/x/text/encoding"
21         "golang.org/x/text/encoding/charmap"
22         "golang.org/x/text/encoding/htmlindex"
23         "golang.org/x/text/transform"
24 )
25
26 // Lookup returns the encoding with the specified label, and its canonical
27 // name. It returns nil and the empty string if label is not one of the
28 // standard encodings for HTML. Matching is case-insensitive and ignores
29 // leading and trailing whitespace. Encoders will use HTML escape sequences for
30 // runes that are not supported by the character set.
31 func Lookup(label string) (e encoding.Encoding, name string) {
32         e, err := htmlindex.Get(label)
33         if err != nil {
34                 return nil, ""
35         }
36         name, _ = htmlindex.Name(e)
37         return &htmlEncoding{e}, name
38 }
39
40 type htmlEncoding struct{ encoding.Encoding }
41
42 func (h *htmlEncoding) NewEncoder() *encoding.Encoder {
43         // HTML requires a non-terminating legacy encoder. We use HTML escapes to
44         // substitute unsupported code points.
45         return encoding.HTMLEscapeUnsupported(h.Encoding.NewEncoder())
46 }
47
48 // DetermineEncoding determines the encoding of an HTML document by examining
49 // up to the first 1024 bytes of content and the declared Content-Type.
50 //
51 // See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding
52 func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {
53         if len(content) > 1024 {
54                 content = content[:1024]
55         }
56
57         for _, b := range boms {
58                 if bytes.HasPrefix(content, b.bom) {
59                         e, name = Lookup(b.enc)
60                         return e, name, true
61                 }
62         }
63
64         if _, params, err := mime.ParseMediaType(contentType); err == nil {
65                 if cs, ok := params["charset"]; ok {
66                         if e, name = Lookup(cs); e != nil {
67                                 return e, name, true
68                         }
69                 }
70         }
71
72         if len(content) > 0 {
73                 e, name = prescan(content)
74                 if e != nil {
75                         return e, name, false
76                 }
77         }
78
79         // Try to detect UTF-8.
80         // First eliminate any partial rune at the end.
81         for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {
82                 b := content[i]
83                 if b < 0x80 {
84                         break
85                 }
86                 if utf8.RuneStart(b) {
87                         content = content[:i]
88                         break
89                 }
90         }
91         hasHighBit := false
92         for _, c := range content {
93                 if c >= 0x80 {
94                         hasHighBit = true
95                         break
96                 }
97         }
98         if hasHighBit && utf8.Valid(content) {
99                 return encoding.Nop, "utf-8", false
100         }
101
102         // TODO: change default depending on user's locale?
103         return charmap.Windows1252, "windows-1252", false
104 }
105
106 // NewReader returns an io.Reader that converts the content of r to UTF-8.
107 // It calls DetermineEncoding to find out what r's encoding is.
108 func NewReader(r io.Reader, contentType string) (io.Reader, error) {
109         preview := make([]byte, 1024)
110         n, err := io.ReadFull(r, preview)
111         switch {
112         case err == io.ErrUnexpectedEOF:
113                 preview = preview[:n]
114                 r = bytes.NewReader(preview)
115         case err != nil:
116                 return nil, err
117         default:
118                 r = io.MultiReader(bytes.NewReader(preview), r)
119         }
120
121         if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop {
122                 r = transform.NewReader(r, e.NewDecoder())
123         }
124         return r, nil
125 }
126
127 // NewReaderLabel returns a reader that converts from the specified charset to
128 // UTF-8. It uses Lookup to find the encoding that corresponds to label, and
129 // returns an error if Lookup returns nil. It is suitable for use as
130 // encoding/xml.Decoder's CharsetReader function.
131 func NewReaderLabel(label string, input io.Reader) (io.Reader, error) {
132         e, _ := Lookup(label)
133         if e == nil {
134                 return nil, fmt.Errorf("unsupported charset: %q", label)
135         }
136         return transform.NewReader(input, e.NewDecoder()), nil
137 }
138
139 func prescan(content []byte) (e encoding.Encoding, name string) {
140         z := html.NewTokenizer(bytes.NewReader(content))
141         for {
142                 switch z.Next() {
143                 case html.ErrorToken:
144                         return nil, ""
145
146                 case html.StartTagToken, html.SelfClosingTagToken:
147                         tagName, hasAttr := z.TagName()
148                         if !bytes.Equal(tagName, []byte("meta")) {
149                                 continue
150                         }
151                         attrList := make(map[string]bool)
152                         gotPragma := false
153
154                         const (
155                                 dontKnow = iota
156                                 doNeedPragma
157                                 doNotNeedPragma
158                         )
159                         needPragma := dontKnow
160
161                         name = ""
162                         e = nil
163                         for hasAttr {
164                                 var key, val []byte
165                                 key, val, hasAttr = z.TagAttr()
166                                 ks := string(key)
167                                 if attrList[ks] {
168                                         continue
169                                 }
170                                 attrList[ks] = true
171                                 for i, c := range val {
172                                         if 'A' <= c && c <= 'Z' {
173                                                 val[i] = c + 0x20
174                                         }
175                                 }
176
177                                 switch ks {
178                                 case "http-equiv":
179                                         if bytes.Equal(val, []byte("content-type")) {
180                                                 gotPragma = true
181                                         }
182
183                                 case "content":
184                                         if e == nil {
185                                                 name = fromMetaElement(string(val))
186                                                 if name != "" {
187                                                         e, name = Lookup(name)
188                                                         if e != nil {
189                                                                 needPragma = doNeedPragma
190                                                         }
191                                                 }
192                                         }
193
194                                 case "charset":
195                                         e, name = Lookup(string(val))
196                                         needPragma = doNotNeedPragma
197                                 }
198                         }
199
200                         if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {
201                                 continue
202                         }
203
204                         if strings.HasPrefix(name, "utf-16") {
205                                 name = "utf-8"
206                                 e = encoding.Nop
207                         }
208
209                         if e != nil {
210                                 return e, name
211                         }
212                 }
213         }
214 }
215
// fromMetaElement extracts the charset value from the text of a meta
// element's content attribute, for example "text/html; charset=utf-8".
//
// It searches for the substring "charset" followed, after optional
// whitespace, by "=". An occurrence of "charset" without "=" is skipped and
// the search continues after it. The value after "=" (again skipping
// whitespace) is either a quoted string, returned without its quotes, or an
// unquoted token that ends at the first semicolon or whitespace character.
// It returns "" when no value is found or when a quoted value is not closed.
func fromMetaElement(s string) string {
	const (
		keyword = "charset"
		spaces  = " \t\n\f\r"
	)

	for len(s) > 0 {
		at := strings.Index(s, keyword)
		if at < 0 {
			return ""
		}

		rest := strings.TrimLeft(s[at+len(keyword):], spaces)
		if len(rest) == 0 || rest[0] != '=' {
			// Not an assignment; keep searching in what follows.
			s = rest
			continue
		}

		value := strings.TrimLeft(rest[1:], spaces)
		if value == "" {
			return ""
		}

		if quote := value[0]; quote == '"' || quote == '\'' {
			value = value[1:]
			if n := strings.IndexByte(value, quote); n >= 0 {
				return value[:n]
			}
			return ""
		}

		if n := strings.IndexAny(value, "; \t\n\f\r"); n >= 0 {
			return value[:n]
		}
		return value
	}
	return ""
}
249
// boms lists the byte order marks recognized at the start of a document,
// each paired with the label of the encoding it indicates.
var boms = []struct {
	bom []byte
	enc string
}{
	{bom: []byte("\xfe\xff"), enc: "utf-16be"},
	{bom: []byte("\xff\xfe"), enc: "utf-16le"},
	{bom: []byte("\xef\xbb\xbf"), enc: "utf-8"},
}