vendor/golang.org/x/net/html/escape.go

   1 // Copyright 2010 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package html
   6
   7 import (
   8         "bytes"
   9         "strings"
  10         "unicode/utf8"
  11 )
  12
  // replacementTable holds the code points substituted for the numeric
  // character references 0x80 through 0x9F, which old documents wrote on the
  // assumption that the text was Windows-1252 encoded. Entry k is the
  // replacement for 0x80+k.
  // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
  //
  // Two related cases are deliberately absent from the table: 0x00 is mapped
  // to '\uFFFD' programmatically, and 0x0D maps to itself ('\u000D').
  var replacementTable = [...]rune{
  	'\u20AC', '\u0081', '\u201A', '\u0192', // 0x80-0x83
  	'\u201E', '\u2026', '\u2020', '\u2021', // 0x84-0x87
  	'\u02C6', '\u2030', '\u0160', '\u2039', // 0x88-0x8B
  	'\u0152', '\u008D', '\u017D', '\u008F', // 0x8C-0x8F
  	'\u0090', '\u2018', '\u2019', '\u201C', // 0x90-0x93
  	'\u201D', '\u2022', '\u2013', '\u2014', // 0x94-0x97
  	'\u02DC', '\u2122', '\u0161', '\u203A', // 0x98-0x9B
  	'\u0153', '\u009D', '\u017E', '\u0178', // 0x9C-0x9F
  }
  52
  53 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
  54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
  55 // Precondition: b[src] == '&' && dst <= src.
  56 // attribute should be true if parsing an attribute value.
  57 func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
  58         // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
  59
  60         // i starts at 1 because we already know that s[0] == '&'.
  61         i, s := 1, b[src:]
  62
  63         if len(s) <= 1 {
  64                 b[dst] = b[src]
  65                 return dst + 1, src + 1
  66         }
  67
  68         if s[i] == '#' {
  69                 if len(s) <= 3 { // We need to have at least "&#.".
  70                         b[dst] = b[src]
  71                         return dst + 1, src + 1
  72                 }
  73                 i++
  74                 c := s[i]
  75                 hex := false
  76                 if c == 'x' || c == 'X' {
  77                         hex = true
  78                         i++
  79                 }
  80
  81                 x := '\x00'
  82                 for i < len(s) {
  83                         c = s[i]
  84                         i++
  85                         if hex {
  86                                 if '0' <= c && c <= '9' {
  87                                         x = 16*x + rune(c) - '0'
  88                                         continue
  89                                 } else if 'a' <= c && c <= 'f' {
  90                                         x = 16*x + rune(c) - 'a' + 10
  91                                         continue
  92                                 } else if 'A' <= c && c <= 'F' {
  93                                         x = 16*x + rune(c) - 'A' + 10
  94                                         continue
  95                                 }
  96                         } else if '0' <= c && c <= '9' {
  97                                 x = 10*x + rune(c) - '0'
  98                                 continue
  99                         }
 100                         if c != ';' {
 101                                 i--
 102                         }
 103                         break
 104                 }
 105
 106                 if i <= 3 { // No characters matched.
 107                         b[dst] = b[src]
 108                         return dst + 1, src + 1
 109                 }
 110
 111                 if 0x80 <= x && x <= 0x9F {
 112                         // Replace characters from Windows-1252 with UTF-8 equivalents.
 113                         x = replacementTable[x-0x80]
 114                 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
 115                         // Replace invalid characters with the replacement character.
 116                         x = '\uFFFD'
 117                 }
 118
 119                 return dst + utf8.EncodeRune(b[dst:], x), src + i
 120         }
 121
 122         // Consume the maximum number of characters possible, with the
 123         // consumed characters matching one of the named references.
 124
 125         for i < len(s) {
 126                 c := s[i]
 127                 i++
 128                 // Lower-cased characters are more common in entities, so we check for them first.
 129                 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
 130                         continue
 131                 }
 132                 if c != ';' {
 133                         i--
 134                 }
 135                 break
 136         }
 137
 138         entityName := string(s[1:i])
 139         if entityName == "" {
 140                 // No-op.
 141         } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
 142                 // No-op.
 143         } else if x := entity[entityName]; x != 0 {
 144                 return dst + utf8.EncodeRune(b[dst:], x), src + i
 145         } else if x := entity2[entityName]; x[0] != 0 {
 146                 dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
 147                 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
 148         } else if !attribute {
 149                 maxLen := len(entityName) - 1
 150                 if maxLen > longestEntityWithoutSemicolon {
 151                         maxLen = longestEntityWithoutSemicolon
 152                 }
 153                 for j := maxLen; j > 1; j-- {
 154                         if x := entity[entityName[:j]]; x != 0 {
 155                                 return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
 156                         }
 157                 }
 158         }
 159
 160         dst1, src1 = dst+i, src+i
 161         copy(b[dst:dst1], b[src:src1])
 162         return dst1, src1
 163 }
 164
 165 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
 166 // attribute should be true if parsing an attribute value.
 167 func unescape(b []byte, attribute bool) []byte {
 168         for i, c := range b {
 169                 if c == '&' {
 170                         dst, src := unescapeEntity(b, i, i, attribute)
 171                         for src < len(b) {
 172                                 c := b[src]
 173                                 if c == '&' {
 174                                         dst, src = unescapeEntity(b, dst, src, attribute)
 175                                 } else {
 176                                         b[dst] = c
 177                                         dst, src = dst+1, src+1
 178                                 }
 179                         }
 180                         return b[0:dst]
 181                 }
 182         }
 183         return b
 184 }
 185
 // lower converts every ASCII upper-case letter in b to lower case, in place,
 // and returns b. For example "aBc" becomes "abc"; all other bytes are left
 // as they are.
 func lower(b []byte) []byte {
 	// Distance between an upper-case ASCII letter and its lower-case form.
 	const caseDelta = 'a' - 'A'
 	for i := 0; i < len(b); i++ {
 		if ch := b[i]; ch >= 'A' && ch <= 'Z' {
 			b[i] = ch + caseDelta
 		}
 	}
 	return b
 }
 195
 196 const escapedChars = "&'<>\"\r"
 197
 198 func escape(w writer, s string) error {
 199         i := strings.IndexAny(s, escapedChars)
 200         for i != -1 {
 201                 if _, err := w.WriteString(s[:i]); err != nil {
 202                         return err
 203                 }
 204                 var esc string
 205                 switch s[i] {
 206                 case '&':
 207                         esc = "&amp;"
 208                 case '\'':
 209                         // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
 210                         esc = "&#39;"
 211                 case '<':
 212                         esc = "&lt;"
 213                 case '>':
 214                         esc = "&gt;"
 215                 case '"':
 216                         // "&#34;" is shorter than "&quot;".
 217                         esc = "&#34;"
 218                 case '\r':
 219                         esc = "&#13;"
 220                 default:
 221                         panic("unrecognized escape character")
 222                 }
 223                 s = s[i+1:]
 224                 if _, err := w.WriteString(esc); err != nil {
 225                         return err
 226                 }
 227                 i = strings.IndexAny(s, escapedChars)
 228         }
 229         _, err := w.WriteString(s)
 230         return err
 231 }
 232
 233 // EscapeString escapes special characters like "<" to become "&lt;". It
 234 // escapes only five such characters: <, >, &, ' and ".
 235 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 236 // always true.
 237 func EscapeString(s string) string {
 238         if strings.IndexAny(s, escapedChars) == -1 {
 239                 return s
 240         }
 241         var buf bytes.Buffer
 242         escape(&buf, s)
 243         return buf.String()
 244 }
 245
 246 // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
 247 // larger range of entities than EscapeString escapes. For example, "&aacute;"
 248 // unescapes to "á", as does "&#225;" and "&xE1;".
 249 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 250 // always true.
 251 func UnescapeString(s string) string {
 252         for _, c := range s {
 253                 if c == '&' {
 254                         return string(unescape([]byte(s), false))
 255                 }
 256         }
 257         return s
 258 }