OSDN Git Service

new repo
[bytom/vapor.git] / vendor / golang.org / x / text / encoding / charmap / maketables.go
1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // +build ignore
6
7 package main
8
9 import (
10         "bufio"
11         "fmt"
12         "log"
13         "net/http"
14         "sort"
15         "strings"
16         "unicode/utf8"
17
18         "golang.org/x/text/encoding"
19         "golang.org/x/text/internal/gen"
20 )
21
// ascii holds the 128 bytes 0x00 through 0x7f in ascending order, i.e. the
// identity mapping for the ASCII range. It is prepended to the upper half of
// WHATWG-derived tables and used as the prefix that identifies a mapping as an
// ASCII superset.
const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
	"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
	" !\"#$%&'()*+,-./" +
	"0123456789:;<=>?" +
	"@ABCDEFGHIJKLMNO" +
	"PQRSTUVWXYZ[\\]^_" +
	"`abcdefghijklmno" +
	"pqrstuvwxyz{|}~\x7f"
27
28 var encodings = []struct {
29         name        string
30         mib         string
31         comment     string
32         varName     string
33         replacement byte
34         mapping     string
35 }{
36         {
37                 "IBM Code Page 037",
38                 "IBM037",
39                 "",
40                 "CodePage037",
41                 0x3f,
42                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
43         },
44         {
45                 "IBM Code Page 437",
46                 "PC8CodePage437",
47                 "",
48                 "CodePage437",
49                 encoding.ASCIISub,
50                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
51         },
52         {
53                 "IBM Code Page 850",
54                 "PC850Multilingual",
55                 "",
56                 "CodePage850",
57                 encoding.ASCIISub,
58                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
59         },
60         {
61                 "IBM Code Page 852",
62                 "PCp852",
63                 "",
64                 "CodePage852",
65                 encoding.ASCIISub,
66                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
67         },
68         {
69                 "IBM Code Page 855",
70                 "IBM855",
71                 "",
72                 "CodePage855",
73                 encoding.ASCIISub,
74                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
75         },
76         {
77                 "Windows Code Page 858", // PC latin1 with Euro
78                 "IBM00858",
79                 "",
80                 "CodePage858",
81                 encoding.ASCIISub,
82                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
83         },
84         {
85                 "IBM Code Page 860",
86                 "IBM860",
87                 "",
88                 "CodePage860",
89                 encoding.ASCIISub,
90                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
91         },
92         {
93                 "IBM Code Page 862",
94                 "PC862LatinHebrew",
95                 "",
96                 "CodePage862",
97                 encoding.ASCIISub,
98                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
99         },
100         {
101                 "IBM Code Page 863",
102                 "IBM863",
103                 "",
104                 "CodePage863",
105                 encoding.ASCIISub,
106                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
107         },
108         {
109                 "IBM Code Page 865",
110                 "IBM865",
111                 "",
112                 "CodePage865",
113                 encoding.ASCIISub,
114                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
115         },
116         {
117                 "IBM Code Page 866",
118                 "IBM866",
119                 "",
120                 "CodePage866",
121                 encoding.ASCIISub,
122                 "http://encoding.spec.whatwg.org/index-ibm866.txt",
123         },
124         {
125                 "IBM Code Page 1047",
126                 "IBM1047",
127                 "",
128                 "CodePage1047",
129                 0x3f,
130                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
131         },
132         {
133                 "IBM Code Page 1140",
134                 "IBM01140",
135                 "",
136                 "CodePage1140",
137                 0x3f,
138                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
139         },
140         {
141                 "ISO 8859-1",
142                 "ISOLatin1",
143                 "",
144                 "ISO8859_1",
145                 encoding.ASCIISub,
146                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
147         },
148         {
149                 "ISO 8859-2",
150                 "ISOLatin2",
151                 "",
152                 "ISO8859_2",
153                 encoding.ASCIISub,
154                 "http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
155         },
156         {
157                 "ISO 8859-3",
158                 "ISOLatin3",
159                 "",
160                 "ISO8859_3",
161                 encoding.ASCIISub,
162                 "http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
163         },
164         {
165                 "ISO 8859-4",
166                 "ISOLatin4",
167                 "",
168                 "ISO8859_4",
169                 encoding.ASCIISub,
170                 "http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
171         },
172         {
173                 "ISO 8859-5",
174                 "ISOLatinCyrillic",
175                 "",
176                 "ISO8859_5",
177                 encoding.ASCIISub,
178                 "http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
179         },
180         {
181                 "ISO 8859-6",
182                 "ISOLatinArabic",
183                 "",
184                 "ISO8859_6,ISO8859_6E,ISO8859_6I",
185                 encoding.ASCIISub,
186                 "http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
187         },
188         {
189                 "ISO 8859-7",
190                 "ISOLatinGreek",
191                 "",
192                 "ISO8859_7",
193                 encoding.ASCIISub,
194                 "http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
195         },
196         {
197                 "ISO 8859-8",
198                 "ISOLatinHebrew",
199                 "",
200                 "ISO8859_8,ISO8859_8E,ISO8859_8I",
201                 encoding.ASCIISub,
202                 "http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
203         },
204         {
205                 "ISO 8859-9",
206                 "ISOLatin5",
207                 "",
208                 "ISO8859_9",
209                 encoding.ASCIISub,
210                 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
211         },
212         {
213                 "ISO 8859-10",
214                 "ISOLatin6",
215                 "",
216                 "ISO8859_10",
217                 encoding.ASCIISub,
218                 "http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
219         },
220         {
221                 "ISO 8859-13",
222                 "ISO885913",
223                 "",
224                 "ISO8859_13",
225                 encoding.ASCIISub,
226                 "http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
227         },
228         {
229                 "ISO 8859-14",
230                 "ISO885914",
231                 "",
232                 "ISO8859_14",
233                 encoding.ASCIISub,
234                 "http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
235         },
236         {
237                 "ISO 8859-15",
238                 "ISO885915",
239                 "",
240                 "ISO8859_15",
241                 encoding.ASCIISub,
242                 "http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
243         },
244         {
245                 "ISO 8859-16",
246                 "ISO885916",
247                 "",
248                 "ISO8859_16",
249                 encoding.ASCIISub,
250                 "http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
251         },
252         {
253                 "KOI8-R",
254                 "KOI8R",
255                 "",
256                 "KOI8R",
257                 encoding.ASCIISub,
258                 "http://encoding.spec.whatwg.org/index-koi8-r.txt",
259         },
260         {
261                 "KOI8-U",
262                 "KOI8U",
263                 "",
264                 "KOI8U",
265                 encoding.ASCIISub,
266                 "http://encoding.spec.whatwg.org/index-koi8-u.txt",
267         },
268         {
269                 "Macintosh",
270                 "Macintosh",
271                 "",
272                 "Macintosh",
273                 encoding.ASCIISub,
274                 "http://encoding.spec.whatwg.org/index-macintosh.txt",
275         },
276         {
277                 "Macintosh Cyrillic",
278                 "MacintoshCyrillic",
279                 "",
280                 "MacintoshCyrillic",
281                 encoding.ASCIISub,
282                 "http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
283         },
284         {
285                 "Windows 874",
286                 "Windows874",
287                 "",
288                 "Windows874",
289                 encoding.ASCIISub,
290                 "http://encoding.spec.whatwg.org/index-windows-874.txt",
291         },
292         {
293                 "Windows 1250",
294                 "Windows1250",
295                 "",
296                 "Windows1250",
297                 encoding.ASCIISub,
298                 "http://encoding.spec.whatwg.org/index-windows-1250.txt",
299         },
300         {
301                 "Windows 1251",
302                 "Windows1251",
303                 "",
304                 "Windows1251",
305                 encoding.ASCIISub,
306                 "http://encoding.spec.whatwg.org/index-windows-1251.txt",
307         },
308         {
309                 "Windows 1252",
310                 "Windows1252",
311                 "",
312                 "Windows1252",
313                 encoding.ASCIISub,
314                 "http://encoding.spec.whatwg.org/index-windows-1252.txt",
315         },
316         {
317                 "Windows 1253",
318                 "Windows1253",
319                 "",
320                 "Windows1253",
321                 encoding.ASCIISub,
322                 "http://encoding.spec.whatwg.org/index-windows-1253.txt",
323         },
324         {
325                 "Windows 1254",
326                 "Windows1254",
327                 "",
328                 "Windows1254",
329                 encoding.ASCIISub,
330                 "http://encoding.spec.whatwg.org/index-windows-1254.txt",
331         },
332         {
333                 "Windows 1255",
334                 "Windows1255",
335                 "",
336                 "Windows1255",
337                 encoding.ASCIISub,
338                 "http://encoding.spec.whatwg.org/index-windows-1255.txt",
339         },
340         {
341                 "Windows 1256",
342                 "Windows1256",
343                 "",
344                 "Windows1256",
345                 encoding.ASCIISub,
346                 "http://encoding.spec.whatwg.org/index-windows-1256.txt",
347         },
348         {
349                 "Windows 1257",
350                 "Windows1257",
351                 "",
352                 "Windows1257",
353                 encoding.ASCIISub,
354                 "http://encoding.spec.whatwg.org/index-windows-1257.txt",
355         },
356         {
357                 "Windows 1258",
358                 "Windows1258",
359                 "",
360                 "Windows1258",
361                 encoding.ASCIISub,
362                 "http://encoding.spec.whatwg.org/index-windows-1258.txt",
363         },
364         {
365                 "X-User-Defined",
366                 "XUserDefined",
367                 "It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
368                 "XUserDefined",
369                 encoding.ASCIISub,
370                 ascii +
371                         "\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
372                         "\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
373                         "\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
374                         "\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
375                         "\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
376                         "\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
377                         "\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
378                         "\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
379                         "\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
380                         "\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
381                         "\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
382                         "\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
383                         "\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
384                         "\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
385                         "\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
386                         "\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
387         },
388 }
389
390 func getWHATWG(url string) string {
391         res, err := http.Get(url)
392         if err != nil {
393                 log.Fatalf("%q: Get: %v", url, err)
394         }
395         defer res.Body.Close()
396
397         mapping := make([]rune, 128)
398         for i := range mapping {
399                 mapping[i] = '\ufffd'
400         }
401
402         scanner := bufio.NewScanner(res.Body)
403         for scanner.Scan() {
404                 s := strings.TrimSpace(scanner.Text())
405                 if s == "" || s[0] == '#' {
406                         continue
407                 }
408                 x, y := 0, 0
409                 if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
410                         log.Fatalf("could not parse %q", s)
411                 }
412                 if x < 0 || 128 <= x {
413                         log.Fatalf("code %d is out of range", x)
414                 }
415                 if 0x80 <= y && y < 0xa0 {
416                         // We diverge from the WHATWG spec by mapping control characters
417                         // in the range [0x80, 0xa0) to U+FFFD.
418                         continue
419                 }
420                 mapping[x] = rune(y)
421         }
422         return ascii + string(mapping)
423 }
424
// getUCM downloads the ICU .ucm file at url and returns a 256-rune mapping
// string indexed by byte value. Only lines matching `<Uxxxx> \xNN |0` are
// used; every other line is skipped. Byte values with no matching line stay
// U+FFFD.
//
// The program is terminated through log.Fatalf if the download fails, the
// server does not answer 200 OK, the body cannot be read completely, or fewer
// than 200 mappings are found (which suggests the page is not a UCM file).
func getUCM(url string) string {
	res, err := http.Get(url)
	if err != nil {
		log.Fatalf("%q: Get: %v", url, err)
	}
	defer res.Body.Close()
	// Report an HTTP failure as such rather than as a confusing
	// "only N characters found" error further down.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("%q: Get: unexpected status %s", url, res.Status)
	}

	mapping := make([]rune, 256)
	for i := range mapping {
		mapping[i] = '\ufffd'
	}

	charsFound := 0
	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		s := strings.TrimSpace(scanner.Text())
		if s == "" || s[0] == '#' {
			continue
		}
		var c byte
		var r rune
		// Lines that do not match (headers, other precision flags) are
		// deliberately ignored rather than treated as errors.
		if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
			continue
		}
		mapping[c] = r
		charsFound++
	}
	// Scan returns false on both EOF and read errors; a truncated body that
	// already yielded 200 mappings would otherwise pass the check below.
	if err := scanner.Err(); err != nil {
		log.Fatalf("%q: read: %v", url, err)
	}

	if charsFound < 200 {
		log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
	}

	return string(mapping)
}
459
460 func main() {
461         mibs := map[string]bool{}
462         all := []string{}
463
464         w := gen.NewCodeWriter()
465         defer w.WriteGoFile("tables.go", "charmap")
466
467         printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
468
469         printf("import (\n")
470         printf("\t\"golang.org/x/text/encoding\"\n")
471         printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
472         printf(")\n\n")
473         for _, e := range encodings {
474                 varNames := strings.Split(e.varName, ",")
475                 all = append(all, varNames...)
476                 varName := varNames[0]
477                 switch {
478                 case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
479                         e.mapping = getWHATWG(e.mapping)
480                 case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
481                         e.mapping = getUCM(e.mapping)
482                 }
483
484                 asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
485                 if asciiSuperset {
486                         low = 0x80
487                 }
488                 lvn := 1
489                 if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
490                         lvn = 3
491                 }
492                 lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
493                 printf("// %s is the %s encoding.\n", varName, e.name)
494                 if e.comment != "" {
495                         printf("//\n// %s\n", e.comment)
496                 }
497                 printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n",
498                         varName, lowerVarName, lowerVarName, e.name)
499                 if mibs[e.mib] {
500                         log.Fatalf("MIB type %q declared multiple times.", e.mib)
501                 }
502                 printf("mib: identifier.%s,\n", e.mib)
503                 printf("asciiSuperset: %t,\n", asciiSuperset)
504                 printf("low: 0x%02x,\n", low)
505                 printf("replacement: 0x%02x,\n", e.replacement)
506
507                 printf("decode: [256]utf8Enc{\n")
508                 i, backMapping := 0, map[rune]byte{}
509                 for _, c := range e.mapping {
510                         if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
511                                 backMapping[c] = byte(i)
512                         }
513                         var buf [8]byte
514                         n := utf8.EncodeRune(buf[:], c)
515                         if n > 3 {
516                                 panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
517                         }
518                         printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
519                         if i%2 == 1 {
520                                 printf("\n")
521                         }
522                         i++
523                 }
524                 printf("},\n")
525
526                 printf("encode: [256]uint32{\n")
527                 encode := make([]uint32, 0, 256)
528                 for c, i := range backMapping {
529                         encode = append(encode, uint32(i)<<24|uint32(c))
530                 }
531                 sort.Sort(byRune(encode))
532                 for len(encode) < cap(encode) {
533                         encode = append(encode, encode[len(encode)-1])
534                 }
535                 for i, enc := range encode {
536                         printf("0x%08x,", enc)
537                         if i%8 == 7 {
538                                 printf("\n")
539                         }
540                 }
541                 printf("},\n}\n")
542
543                 // Add an estimate of the size of a single Charmap{} struct value, which
544                 // includes two 256 elem arrays of 4 bytes and some extra fields, which
545                 // align to 3 uint64s on 64-bit architectures.
546                 w.Size += 2*4*256 + 3*8
547         }
548         // TODO: add proper line breaking.
549         printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
550 }
551
// byRune implements sort.Interface for packed encode entries, ordering them
// by their low 24 bits only and ignoring the top 8 bits.
type byRune []uint32

// Len reports the number of entries.
func (b byRune) Len() int {
	return len(b)
}

// Less reports whether the low 24 bits of entry i are smaller than those of
// entry j.
func (b byRune) Less(i, j int) bool {
	const runeMask = 0xffffff
	lhs, rhs := b[i]&runeMask, b[j]&runeMask
	return lhs < rhs
}

// Swap exchanges entries i and j.
func (b byRune) Swap(i, j int) {
	b[j], b[i] = b[i], b[j]
}