1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package simplifiedchinese
10 "golang.org/x/text/encoding"
11 "golang.org/x/text/encoding/internal"
12 "golang.org/x/text/encoding/internal/identifier"
13 "golang.org/x/text/transform"
17 // GB18030 is the GB18030 encoding.
// It is backed by the package-level gbk18030 value, whose decoder and encoder
// are constructed with gb18030 set to true.
18 GB18030 encoding.Encoding = &gbk18030
19 // GBK is the GBK encoding. It encodes an extension of the GB2312 character set
20 // and is also known as Code Page 936.
// It is backed by the package-level gbk value, whose decoder and encoder are
// constructed with gb18030 set to false.
21 GBK encoding.Encoding = &gbk
// gbk is the internal.Encoding value that the exported GBK variable points
// to. It pairs a gbkDecoder and a gbkEncoder that both have their gb18030
// flag set to false.
24 var gbk = internal.Encoding{
25 &internal.SimpleEncoding{
26 gbkDecoder{gb18030: false},
27 gbkEncoder{gb18030: false},
// gbk18030 is the internal.Encoding value that the exported GB18030 variable
// points to. It uses the same decoder and encoder types as gbk, but with
// their gb18030 flag set to true.
33 var gbk18030 = internal.Encoding{
34 &internal.SimpleEncoding{
35 gbkDecoder{gb18030: true},
36 gbkEncoder{gb18030: true},
// gbkDecoder is the decoding transformer used by both the gbk and gbk18030
// values. Its gb18030 field gates the Transform branch that accepts a second
// byte in the range 0x30..0x3f.
42 type gbkDecoder struct {
// Transform decodes bytes from src and writes the resulting runes to dst as
// UTF-8. It returns the number of bytes written to dst (nDst), the number of
// bytes consumed from src (nSrc), and an error.
//
// The error results set in this function are transform.ErrShortSrc, where src
// ends inside a multi-byte sequence, and transform.ErrShortDst, where dst has
// no room left for the next rune. The failure branches substitute
// utf8.RuneError for the offending input.
47 func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
// Each iteration decodes one character and advances nSrc by its encoded size.
50 for ; nSrc < len(src); nSrc += size {
51 switch c0 := src[nSrc]; {
// Bytes below utf8.RuneSelf are handled by the single-byte case.
52 case c0 < utf8.RuneSelf:
55 // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
56 // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
57 // says to treat "gbk" as Code Page 936.
// From here on a second byte is required; check that src still holds one.
62 if nSrc+1 >= len(src) {
64 err = transform.ErrShortSrc
67 r, size = utf8.RuneError, 1
// Second-byte ranges 0x40..0x7e and 0x80..0xfe.
72 case 0x40 <= c1 && c1 < 0x7f:
74 case 0x80 <= c1 && c1 < 0xff:
// Second byte in 0x30..0x3f: taken only when d.gb18030 is set. This branch
// reads four bytes in total (c0 through c3).
76 case d.gb18030 && 0x30 <= c1 && c1 < 0x40:
// The four-byte form needs three more bytes after c0.
77 if nSrc+3 >= len(src) {
79 err = transform.ErrShortSrc
82 // The second byte here is always ASCII, so we can set size
84 r, size = utf8.RuneError, 1
// The third byte must lie in 0x81..0xfe.
88 if c2 < 0x81 || 0xff <= c2 {
89 r, size = utf8.RuneError, 1
// The fourth byte must lie in 0x30..0x39.
93 if c3 < 0x30 || 0x3a <= c3 {
94 r, size = utf8.RuneError, 1
// Fold the four bytes into a single linear index: c0 and c2 are offset from
// 0x81, c1 and c3 from 0x30, and the positions are weighted by 10, 126 and 10.
98 r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)
// Search bounds over the gb18030 table, comparing r against the first column
// of each entry.
100 i, j := 0, len(gb18030)
103 if r >= rune(gb18030[h][0]) {
// Shift r by dec[1] - dec[0]; dec is presumably the gb18030 entry located by
// the search above (TODO confirm).
110 r += rune(dec[1]) - rune(dec[0])
// Range check on the adjusted value: it must lie in [0, 0x100000).
114 if 0 <= r && r < 0x100000 {
117 r, size = utf8.RuneError, 1
121 r, size = utf8.RuneError, 1
// Start from U+FFFD with a size of two bytes, then consult the decode table.
124 r, size = '\ufffd', 2
// The decode table is indexed with a stride of 190 entries per lead byte,
// counting lead bytes from 0x81. Indices beyond the table are not looked up.
125 if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
133 r, size = utf8.RuneError, 1
// Make sure the UTF-8 form of r fits in what is left of dst before writing.
137 if nDst+utf8.RuneLen(r) > len(dst) {
138 err = transform.ErrShortDst
141 nDst += utf8.EncodeRune(dst[nDst:], r)
143 return nDst, nSrc, err
// gbkEncoder is the encoding transformer used by both the gbk and gbk18030
// values, which construct it with a gb18030 flag. It embeds
// transform.NopResetter.
146 type gbkEncoder struct {
147 transform.NopResetter
// Transform reads UTF-8 encoded runes from src and writes their encoded form
// to dst. It returns the number of bytes written to dst (nDst), the number of
// bytes consumed from src (nSrc), and an error.
//
// The error results set in this function are transform.ErrShortSrc (src ends
// with an incomplete rune and atEOF is false), transform.ErrShortDst (dst is
// too small for the one-, two- or four-byte output about to be written) and
// internal.ErrASCIIReplacement.
151 func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
152 r, r2, size := rune(0), rune(0), 0
// Each iteration handles one rune and advances nSrc by its UTF-8 size.
153 for ; nSrc < len(src); nSrc += size {
156 // Decode a 1-byte rune.
157 if r < utf8.RuneSelf {
161 // Decode a multi-byte rune.
162 r, size = utf8.DecodeRune(src[nSrc:])
164 // All valid runes of size 1 (those below utf8.RuneSelf) were
165 // handled above. We have invalid UTF-8 or we haven't seen the
166 // full character yet.
167 if !atEOF && !utf8.FullRune(src[nSrc:]) {
168 err = transform.ErrShortSrc
173 // func init checks that the switch covers all tables.
// Each case below covers one [encodeNLow, encodeNHigh) range of runes and
// indexes the matching encodeN table. The looked-up value is stored in r2 and
// tested against zero.
175 case encode0Low <= r && r < encode0High:
176 if r2 = rune(encode0[r-encode0Low]); r2 != 0 {
179 case encode1Low <= r && r < encode1High:
180 // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
181 // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
182 // says to treat "gbk" as Code Page 936.
187 if r2 = rune(encode1[r-encode1Low]); r2 != 0 {
190 case encode2Low <= r && r < encode2High:
191 if r2 = rune(encode2[r-encode2Low]); r2 != 0 {
194 case encode3Low <= r && r < encode3High:
195 if r2 = rune(encode3[r-encode3Low]); r2 != 0 {
198 case encode4Low <= r && r < encode4High:
199 if r2 = rune(encode4[r-encode4Low]); r2 != 0 {
// Search bounds over the gb18030 table. The encoder compares r against the
// second column of each entry, where the decoder compares against the first.
206 i, j := 0, len(gb18030)
209 if r >= rune(gb18030[h][1]) {
// Reverse of the decoder's adjustment: shift r by dec[0] - dec[1].
216 r += rune(dec[0]) - rune(dec[1])
// Runes taken by this branch are rebased so that 0x10000 maps to 189000.
218 } else if r < 0x110000 {
219 r += 189000 - 0x10000
// NOTE(review): presumably reached when r has no encoding in this character
// set; confirm against the documentation of internal.ErrASCIIReplacement.
223 err = internal.ErrASCIIReplacement
// One-byte output: there must be at least one free byte in dst.
228 if nDst >= len(dst) {
229 err = transform.ErrShortDst
// Two-byte output: r2 is written high byte first, then low byte.
237 if nDst+2 > len(dst) {
238 err = transform.ErrShortDst
241 dst[nDst+0] = uint8(r2 >> 8)
242 dst[nDst+1] = uint8(r2)
// Four-byte output, filled from the last byte to the first: a decimal digit
// offset by 0x30, a base-126 digit offset by 0x81, another decimal digit
// offset by 0x30, and finally the remaining value offset by 0x81.
247 if nDst+4 > len(dst) {
248 err = transform.ErrShortDst
251 dst[nDst+3] = uint8(r%10 + 0x30)
253 dst[nDst+2] = uint8(r%126 + 0x81)
255 dst[nDst+1] = uint8(r%10 + 0x30)
257 dst[nDst+0] = uint8(r + 0x81)
261 return nDst, nSrc, err
265 // Check that the hard-coded encode switch covers all tables.
// gbkEncoder.Transform has one case per table, encode0 through encode4, so a
// table count other than 5 means that switch is out of date.
266 if numEncodeTables != 5 {
267 panic("bad numEncodeTables")