vendor/golang.org/x/text/encoding/encoding.go

   1 // Copyright 2013 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package encoding defines an interface for character encodings, such as Shift
   6 // JIS and Windows 1252, that can convert to and from UTF-8.
   7 //
   8 // Encoding implementations are provided in other packages, such as
   9 // golang.org/x/text/encoding/charmap and
  10 // golang.org/x/text/encoding/japanese.
  11 package encoding // import "golang.org/x/text/encoding"
  12
  13 import (
  14         "errors"
  15         "io"
  16         "strconv"
  17         "unicode/utf8"
  18
  19         "golang.org/x/text/encoding/internal/identifier"
  20         "golang.org/x/text/transform"
  21 )
  22
  23 // TODO:
  24 // - There seems to be some inconsistency in when decoders return errors
  25 //   and when not. Also documentation seems to suggest they shouldn't return
  26 //   errors at all (except for UTF-16).
  27 // - Encoders seem to rely on or at least benefit from the input being in NFC
  28 //   normal form. Perhaps add an example how users could prepare their output.
  29
  30 // Encoding is a character set encoding that can be transformed to and from
  31 // UTF-8.
  32 type Encoding interface {
  33         // NewDecoder returns a Decoder.
  34         NewDecoder() *Decoder
  35
  36         // NewEncoder returns an Encoder.
  37         NewEncoder() *Encoder
  38 }
  39
  40 // A Decoder converts bytes to UTF-8. It implements transform.Transformer.
  41 //
  42 // Transforming source bytes that are not of that encoding will not result in an
  43 // error per se. Each byte that cannot be transcoded will be represented in the
  44 // output by the UTF-8 encoding of '\uFFFD', the replacement rune.
  45 type Decoder struct {
  46         transform.Transformer
  47
  48         // This forces external creators of Decoders to use names in struct
  49         // initializers, allowing for future extendibility without having to break
  50         // code.
  51         _ struct{}
  52 }
  53
  54 // Bytes converts the given encoded bytes to UTF-8. It returns the converted
  55 // bytes or nil, err if any error occurred.
  56 func (d *Decoder) Bytes(b []byte) ([]byte, error) {
  57         b, _, err := transform.Bytes(d, b)
  58         if err != nil {
  59                 return nil, err
  60         }
  61         return b, nil
  62 }
  63
  64 // String converts the given encoded string to UTF-8. It returns the converted
  65 // string or "", err if any error occurred.
  66 func (d *Decoder) String(s string) (string, error) {
  67         s, _, err := transform.String(d, s)
  68         if err != nil {
  69                 return "", err
  70         }
  71         return s, nil
  72 }
  73
  74 // Reader wraps another Reader to decode its bytes.
  75 //
  76 // The Decoder may not be used for any other operation as long as the returned
  77 // Reader is in use.
  78 func (d *Decoder) Reader(r io.Reader) io.Reader {
  79         return transform.NewReader(r, d)
  80 }
  81
  82 // An Encoder converts bytes from UTF-8. It implements transform.Transformer.
  83 //
  84 // Each rune that cannot be transcoded will result in an error. In this case,
  85 // the transform will consume all source byte up to, not including the offending
  86 // rune. Transforming source bytes that are not valid UTF-8 will be replaced by
  87 // `\uFFFD`. To return early with an error instead, use transform.Chain to
  88 // preprocess the data with a UTF8Validator.
  89 type Encoder struct {
  90         transform.Transformer
  91
  92         // This forces external creators of Encoders to use names in struct
  93         // initializers, allowing for future extendibility without having to break
  94         // code.
  95         _ struct{}
  96 }
  97
  98 // Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
  99 // any error occurred.
 100 func (e *Encoder) Bytes(b []byte) ([]byte, error) {
 101         b, _, err := transform.Bytes(e, b)
 102         if err != nil {
 103                 return nil, err
 104         }
 105         return b, nil
 106 }
 107
 108 // String converts a string from UTF-8. It returns the converted string or
 109 // "", err if any error occurred.
 110 func (e *Encoder) String(s string) (string, error) {
 111         s, _, err := transform.String(e, s)
 112         if err != nil {
 113                 return "", err
 114         }
 115         return s, nil
 116 }
 117
 118 // Writer wraps another Writer to encode its UTF-8 output.
 119 //
 120 // The Encoder may not be used for any other operation as long as the returned
 121 // Writer is in use.
 122 func (e *Encoder) Writer(w io.Writer) io.Writer {
 123         return transform.NewWriter(w, e)
 124 }
 125
 126 // ASCIISub is the ASCII substitute character, as recommended by
 127 // http://unicode.org/reports/tr36/#Text_Comparison
 128 const ASCIISub = '\x1a'
 129
 130 // Nop is the nop encoding. Its transformed bytes are the same as the source
 131 // bytes; it does not replace invalid UTF-8 sequences.
 132 var Nop Encoding = nop{}
 133
 134 type nop struct{}
 135
 136 func (nop) NewDecoder() *Decoder {
 137         return &Decoder{Transformer: transform.Nop}
 138 }
 139 func (nop) NewEncoder() *Encoder {
 140         return &Encoder{Transformer: transform.Nop}
 141 }
 142
 143 // Replacement is the replacement encoding. Decoding from the replacement
 144 // encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
 145 // the replacement encoding yields the same as the source bytes except that
 146 // invalid UTF-8 is converted to '\uFFFD'.
 147 //
 148 // It is defined at http://encoding.spec.whatwg.org/#replacement
 149 var Replacement Encoding = replacement{}
 150
 151 type replacement struct{}
 152
 153 func (replacement) NewDecoder() *Decoder {
 154         return &Decoder{Transformer: replacementDecoder{}}
 155 }
 156
 157 func (replacement) NewEncoder() *Encoder {
 158         return &Encoder{Transformer: replacementEncoder{}}
 159 }
 160
 161 func (replacement) ID() (mib identifier.MIB, other string) {
 162         return identifier.Replacement, ""
 163 }
 164
 165 type replacementDecoder struct{ transform.NopResetter }
 166
 167 func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 168         if len(dst) < 3 {
 169                 return 0, 0, transform.ErrShortDst
 170         }
 171         if atEOF {
 172                 const fffd = "\ufffd"
 173                 dst[0] = fffd[0]
 174                 dst[1] = fffd[1]
 175                 dst[2] = fffd[2]
 176                 nDst = 3
 177         }
 178         return nDst, len(src), nil
 179 }
 180
 181 type replacementEncoder struct{ transform.NopResetter }
 182
 183 func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 184         r, size := rune(0), 0
 185
 186         for ; nSrc < len(src); nSrc += size {
 187                 r = rune(src[nSrc])
 188
 189                 // Decode a 1-byte rune.
 190                 if r < utf8.RuneSelf {
 191                         size = 1
 192
 193                 } else {
 194                         // Decode a multi-byte rune.
 195                         r, size = utf8.DecodeRune(src[nSrc:])
 196                         if size == 1 {
 197                                 // All valid runes of size 1 (those below utf8.RuneSelf) were
 198                                 // handled above. We have invalid UTF-8 or we haven't seen the
 199                                 // full character yet.
 200                                 if !atEOF && !utf8.FullRune(src[nSrc:]) {
 201                                         err = transform.ErrShortSrc
 202                                         break
 203                                 }
 204                                 r = '\ufffd'
 205                         }
 206                 }
 207
 208                 if nDst+utf8.RuneLen(r) > len(dst) {
 209                         err = transform.ErrShortDst
 210                         break
 211                 }
 212                 nDst += utf8.EncodeRune(dst[nDst:], r)
 213         }
 214         return nDst, nSrc, err
 215 }
 216
 217 // HTMLEscapeUnsupported wraps encoders to replace source runes outside the
 218 // repertoire of the destination encoding with HTML escape sequences.
 219 //
 220 // This wrapper exists to comply to URL and HTML forms requiring a
 221 // non-terminating legacy encoder. The produced sequences may lead to data
 222 // loss as they are indistinguishable from legitimate input. To avoid this
 223 // issue, use UTF-8 encodings whenever possible.
 224 func HTMLEscapeUnsupported(e *Encoder) *Encoder {
 225         return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
 226 }
 227
 228 // ReplaceUnsupported wraps encoders to replace source runes outside the
 229 // repertoire of the destination encoding with an encoding-specific
 230 // replacement.
 231 //
 232 // This wrapper is only provided for backwards compatibility and legacy
 233 // handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
 234 func ReplaceUnsupported(e *Encoder) *Encoder {
 235         return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
 236 }
 237
 238 type errorHandler struct {
 239         *Encoder
 240         handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
 241 }
 242
 243 // TODO: consider making this error public in some form.
 244 type repertoireError interface {
 245         Replacement() byte
 246 }
 247
 248 func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 249         nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
 250         for err != nil {
 251                 rerr, ok := err.(repertoireError)
 252                 if !ok {
 253                         return nDst, nSrc, err
 254                 }
 255                 r, sz := utf8.DecodeRune(src[nSrc:])
 256                 n, ok := h.handler(dst[nDst:], r, rerr)
 257                 if !ok {
 258                         return nDst, nSrc, transform.ErrShortDst
 259                 }
 260                 err = nil
 261                 nDst += n
 262                 if nSrc += sz; nSrc < len(src) {
 263                         var dn, sn int
 264                         dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
 265                         nDst += dn
 266                         nSrc += sn
 267                 }
 268         }
 269         return nDst, nSrc, err
 270 }
 271
 272 func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
 273         buf := [8]byte{}
 274         b := strconv.AppendUint(buf[:0], uint64(r), 10)
 275         if n = len(b) + len("&#;"); n >= len(dst) {
 276                 return 0, false
 277         }
 278         dst[0] = '&'
 279         dst[1] = '#'
 280         dst[copy(dst[2:], b)+2] = ';'
 281         return n, true
 282 }
 283
 284 func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
 285         if len(dst) == 0 {
 286                 return 0, false
 287         }
 288         dst[0] = err.Replacement()
 289         return 1, true
 290 }
 291
 292 // ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
 293 var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
 294
 295 // UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
 296 // input byte that is not valid UTF-8.
 297 var UTF8Validator transform.Transformer = utf8Validator{}
 298
 299 type utf8Validator struct{ transform.NopResetter }
 300
 301 func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 302         n := len(src)
 303         if n > len(dst) {
 304                 n = len(dst)
 305         }
 306         for i := 0; i < n; {
 307                 if c := src[i]; c < utf8.RuneSelf {
 308                         dst[i] = c
 309                         i++
 310                         continue
 311                 }
 312                 _, size := utf8.DecodeRune(src[i:])
 313                 if size == 1 {
 314                         // All valid runes of size 1 (those below utf8.RuneSelf) were
 315                         // handled above. We have invalid UTF-8 or we haven't seen the
 316                         // full character yet.
 317                         err = ErrInvalidUTF8
 318                         if !atEOF && !utf8.FullRune(src[i:]) {
 319                                 err = transform.ErrShortSrc
 320                         }
 321                         return i, i, err
 322                 }
 323                 if i+size > len(dst) {
 324                         return i, i, transform.ErrShortDst
 325                 }
 326                 for ; size > 0; size-- {
 327                         dst[i] = src[i]
 328                         i++
 329                 }
 330         }
 331         if len(src) > len(dst) {
 332                 err = transform.ErrShortDst
 333         }
 334         return n, n, err
 335 }