vendor/golang.org/x/text/encoding/unicode/utf32/utf32.go

   1 // Copyright 2016 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package utf32 provides the UTF-32 Unicode encoding.
   6 //
   7 // Please note that support for UTF-32 is discouraged as it is a rare and
   8 // inefficient encoding, unfit for use as an interchange format. For use
   9 // on the web, the W3C strongly discourages its use
  10 // (https://www.w3.org/TR/html5/document-metadata.html#charset)
  11 // while WHATWG directly prohibits supporting it
  12 // (https://html.spec.whatwg.org/multipage/syntax.html#character-encodings).
  13 package utf32 // import "golang.org/x/text/encoding/unicode/utf32"
  14
  15 import (
  16         "errors"
  17         "unicode/utf8"
  18
  19         "golang.org/x/text/encoding"
  20         "golang.org/x/text/encoding/internal/identifier"
  21         "golang.org/x/text/transform"
  22 )
  23
  24 // All lists a configuration for each IANA-defined UTF-32 variant.
  25 var All = []encoding.Encoding{
  26         UTF32(BigEndian, UseBOM),
  27         UTF32(BigEndian, IgnoreBOM),
  28         UTF32(LittleEndian, IgnoreBOM),
  29 }
  30
  // ErrMissingBOM is returned when UTF-32 input is decoded under the ExpectBOM
  // policy and the input does not begin with a byte order mark.
  var ErrMissingBOM = errors.New("encoding: missing byte order mark")
  34
  35 // UTF32 returns a UTF-32 Encoding for the given default endianness and
  36 // byte order mark (BOM) policy.
  37 //
  38 // When decoding from UTF-32 to UTF-8, if the BOMPolicy is IgnoreBOM then
  39 // neither BOMs U+FEFF nor ill-formed code units 0xFFFE0000 in the input
  40 // stream will affect the endianness used for decoding. Instead BOMs will
  41 // be output as their standard UTF-8 encoding "\xef\xbb\xbf" while
  42 // 0xFFFE0000 code units will be output as "\xef\xbf\xbd", the standard
  43 // UTF-8 encoding for the Unicode replacement character. If the BOMPolicy
  44 // is UseBOM or ExpectBOM a starting BOM is not written to the UTF-8
  45 // output. Instead, it overrides the default endianness e for the remainder
  46 // of the transformation. Any subsequent BOMs U+FEFF or ill-formed code
  47 // units 0xFFFE0000 will not affect the endianness used, and will instead
  48 // be output as their standard UTF-8 (replacement) encodings. For UseBOM,
  49 // if there is no starting BOM, it will proceed with the default
  50 // Endianness. For ExpectBOM, in that case, the transformation will return
  51 // early with an ErrMissingBOM error.
  52 //
  53 // When encoding from UTF-8 to UTF-32, a BOM will be inserted at the start
  54 // of the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM
  55 // will not be inserted. The UTF-8 input does not need to contain a BOM.
  56 //
  57 // There is no concept of a 'native' endianness. If the UTF-32 data is
  58 // produced and consumed in a greater context that implies a certain
  59 // endianness, use IgnoreBOM. Otherwise, use ExpectBOM and always produce
  60 // and consume a BOM.
  61 //
  62 // In the language of http://www.unicode.org/faq/utf_bom.html#bom10,
  63 // IgnoreBOM corresponds to "Where the precise type of the data stream is
  64 // known... the BOM should not be used" and ExpectBOM corresponds to "A
  65 // particular protocol... may require use of the BOM".
  66 func UTF32(e Endianness, b BOMPolicy) encoding.Encoding {
  67         return utf32Encoding{config{e, b}, mibValue[e][b&bomMask]}
  68 }
  69
  70 // mibValue maps Endianness and BOMPolicy settings to MIB constants for UTF-32.
  71 // Note that some configurations map to the same MIB identifier.
  72 var mibValue = map[Endianness][numBOMValues]identifier.MIB{
  73         BigEndian: [numBOMValues]identifier.MIB{
  74                 IgnoreBOM: identifier.UTF32BE,
  75                 UseBOM:    identifier.UTF32,
  76         },
  77         LittleEndian: [numBOMValues]identifier.MIB{
  78                 IgnoreBOM: identifier.UTF32LE,
  79                 UseBOM:    identifier.UTF32,
  80         },
  81         // ExpectBOM is not widely used and has no valid MIB identifier.
  82 }
  83
  // BOMPolicy is a UTF-32 encoding's byte order mark policy.
  type BOMPolicy uint8

  const (
  	// Flag bits from which the exported policies are composed.
  	writeBOM   BOMPolicy = 1 << iota // 0x01
  	acceptBOM                        // 0x02
  	requireBOM                       // 0x04

  	// bomMask covers every flag bit above (0x07).
  	bomMask BOMPolicy = writeBOM | acceptBOM | requireBOM

  	// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
  	// map of an array of length 8 of a type that is also used as a key or value
  	// in another map). See golang.org/issue/11354.
  	// TODO: consider changing this value back to 8 if the use of 1.4.* has
  	// been minimized.
  	numBOMValues = 8 + 1

  	// IgnoreBOM means to ignore any byte order marks. This is the
  	// Unicode-compliant interpretation for UTF-32BE/LE.
  	IgnoreBOM BOMPolicy = 0

  	// UseBOM means that the UTF-32 form may start with a byte order mark,
  	// which will be used to override the default encoding. This is the
  	// Unicode-compliant interpretation for UTF-32.
  	UseBOM BOMPolicy = writeBOM | acceptBOM

  	// ExpectBOM means that the UTF-32 form must start with a byte order mark,
  	// which will be used to override the default encoding.
  	ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
  	// Consistent with BOMPolicy definition in golang.org/x/text/encoding/unicode
  )
 114
 // Endianness is the default byte order of a UTF-32 encoding.
 type Endianness bool

 // The two supported byte orders. BigEndian is the zero value.
 const (
 	// BigEndian is UTF-32BE.
 	BigEndian Endianness = false
 	// LittleEndian is UTF-32LE.
 	LittleEndian Endianness = !BigEndian
 )
 124
 125 type config struct {
 126         endianness Endianness
 127         bomPolicy  BOMPolicy
 128 }
 129
 130 type utf32Encoding struct {
 131         config
 132         mib identifier.MIB
 133 }
 134
 135 func (u utf32Encoding) NewDecoder() *encoding.Decoder {
 136         return &encoding.Decoder{Transformer: &utf32Decoder{
 137                 initial: u.config,
 138                 current: u.config,
 139         }}
 140 }
 141
 142 func (u utf32Encoding) NewEncoder() *encoding.Encoder {
 143         return &encoding.Encoder{Transformer: &utf32Encoder{
 144                 endianness:       u.endianness,
 145                 initialBOMPolicy: u.bomPolicy,
 146                 currentBOMPolicy: u.bomPolicy,
 147         }}
 148 }
 149
 150 func (u utf32Encoding) ID() (mib identifier.MIB, other string) {
 151         return u.mib, ""
 152 }
 153
 154 func (u utf32Encoding) String() string {
 155         e, b := "B", ""
 156         if u.endianness == LittleEndian {
 157                 e = "L"
 158         }
 159         switch u.bomPolicy {
 160         case ExpectBOM:
 161                 b = "Expect"
 162         case UseBOM:
 163                 b = "Use"
 164         case IgnoreBOM:
 165                 b = "Ignore"
 166         }
 167         return "UTF-32" + e + "E (" + b + " BOM)"
 168 }
 169
 170 type utf32Decoder struct {
 171         initial config
 172         current config
 173 }
 174
 175 func (u *utf32Decoder) Reset() {
 176         u.current = u.initial
 177 }
 178
 179 func (u *utf32Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 180         if len(src) == 0 {
 181                 if atEOF && u.current.bomPolicy&requireBOM != 0 {
 182                         return 0, 0, ErrMissingBOM
 183                 }
 184                 return 0, 0, nil
 185         }
 186         if u.current.bomPolicy&acceptBOM != 0 {
 187                 if len(src) < 4 {
 188                         return 0, 0, transform.ErrShortSrc
 189                 }
 190                 switch {
 191                 case src[0] == 0x00 && src[1] == 0x00 && src[2] == 0xfe && src[3] == 0xff:
 192                         u.current.endianness = BigEndian
 193                         nSrc = 4
 194                 case src[0] == 0xff && src[1] == 0xfe && src[2] == 0x00 && src[3] == 0x00:
 195                         u.current.endianness = LittleEndian
 196                         nSrc = 4
 197                 default:
 198                         if u.current.bomPolicy&requireBOM != 0 {
 199                                 return 0, 0, ErrMissingBOM
 200                         }
 201                 }
 202                 u.current.bomPolicy = IgnoreBOM
 203         }
 204
 205         var r rune
 206         var dSize, sSize int
 207         for nSrc < len(src) {
 208                 if nSrc+3 < len(src) {
 209                         x := uint32(src[nSrc+0])<<24 | uint32(src[nSrc+1])<<16 |
 210                                 uint32(src[nSrc+2])<<8 | uint32(src[nSrc+3])
 211                         if u.current.endianness == LittleEndian {
 212                                 x = x>>24 | (x >> 8 & 0x0000FF00) | (x << 8 & 0x00FF0000) | x<<24
 213                         }
 214                         r, sSize = rune(x), 4
 215                         if dSize = utf8.RuneLen(r); dSize < 0 {
 216                                 r, dSize = utf8.RuneError, 3
 217                         }
 218                 } else if atEOF {
 219                         // 1..3 trailing bytes.
 220                         r, dSize, sSize = utf8.RuneError, 3, len(src)-nSrc
 221                 } else {
 222                         err = transform.ErrShortSrc
 223                         break
 224                 }
 225                 if nDst+dSize > len(dst) {
 226                         err = transform.ErrShortDst
 227                         break
 228                 }
 229                 nDst += utf8.EncodeRune(dst[nDst:], r)
 230                 nSrc += sSize
 231         }
 232         return nDst, nSrc, err
 233 }
 234
 235 type utf32Encoder struct {
 236         endianness       Endianness
 237         initialBOMPolicy BOMPolicy
 238         currentBOMPolicy BOMPolicy
 239 }
 240
 241 func (u *utf32Encoder) Reset() {
 242         u.currentBOMPolicy = u.initialBOMPolicy
 243 }
 244
 245 func (u *utf32Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 246         if u.currentBOMPolicy&writeBOM != 0 {
 247                 if len(dst) < 4 {
 248                         return 0, 0, transform.ErrShortDst
 249                 }
 250                 dst[0], dst[1], dst[2], dst[3] = 0x00, 0x00, 0xfe, 0xff
 251                 u.currentBOMPolicy = IgnoreBOM
 252                 nDst = 4
 253         }
 254
 255         r, size := rune(0), 0
 256         for nSrc < len(src) {
 257                 r = rune(src[nSrc])
 258
 259                 // Decode a 1-byte rune.
 260                 if r < utf8.RuneSelf {
 261                         size = 1
 262
 263                 } else {
 264                         // Decode a multi-byte rune.
 265                         r, size = utf8.DecodeRune(src[nSrc:])
 266                         if size == 1 {
 267                                 // All valid runes of size 1 (those below utf8.RuneSelf) were
 268                                 // handled above. We have invalid UTF-8 or we haven't seen the
 269                                 // full character yet.
 270                                 if !atEOF && !utf8.FullRune(src[nSrc:]) {
 271                                         err = transform.ErrShortSrc
 272                                         break
 273                                 }
 274                         }
 275                 }
 276
 277                 if nDst+4 > len(dst) {
 278                         err = transform.ErrShortDst
 279                         break
 280                 }
 281
 282                 dst[nDst+0] = uint8(r >> 24)
 283                 dst[nDst+1] = uint8(r >> 16)
 284                 dst[nDst+2] = uint8(r >> 8)
 285                 dst[nDst+3] = uint8(r)
 286                 nDst += 4
 287                 nSrc += size
 288         }
 289
 290         if u.endianness == LittleEndian {
 291                 for i := 0; i < nDst; i += 4 {
 292                         dst[i], dst[i+1], dst[i+2], dst[i+3] = dst[i+3], dst[i+2], dst[i+1], dst[i]
 293                 }
 294         }
 295         return nDst, nSrc, err
 296 }