1 // Copyright 2015 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Unicode table generator.
6 // Data read from the web.
18 "golang.org/x/text/internal/gen"
19 "golang.org/x/text/internal/triegen"
20 "golang.org/x/text/internal/ucd"
21 "golang.org/x/text/unicode/norm"
22 "golang.org/x/text/unicode/rangetable"
25 var outputFile = flag.String("output", "tables.go", "output file for generated tables; default tables.go")
27 var assigned, disallowedRunes *unicode.RangeTable
29 var runeCategory = map[rune]category{}
31 var overrides = map[category]category{
32 viramaModifier: viramaJoinT,
37 func setCategory(r rune, cat category) {
38 if c, ok := runeCategory[r]; ok {
39 if override, ok := overrides[c]; cat == joiningT && ok {
42 log.Fatalf("%U: multiple categories for rune (%v and %v)", r, c, cat)
49 if numCategories > 1<<propShift {
50 log.Fatalf("Number of categories is %d; may at most be %d", numCategories, 1<<propShift)
59 // PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
60 ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
61 if p.String(1) == "Default_Ignorable_Code_Point" {
62 runes = append(runes, p.Rune(0))
65 ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) {
67 case "Noncharacter_Code_Point":
68 runes = append(runes, p.Rune(0))
71 // OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
72 ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
75 runes = append(runes, p.Rune(0))
79 disallowedRunes = rangetable.New(runes...)
80 assigned = rangetable.Assigned(unicode.Version)
82 // Load category data.
83 runeCategory['l'] = latinSmallL
84 ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
86 if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
87 setCategory(p.Rune(0), viramaModifier)
90 ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
93 setCategory(p.Rune(0), greek)
95 setCategory(p.Rune(0), hebrew)
96 case "Hiragana", "Katakana", "Han":
97 setCategory(p.Rune(0), japanese)
101 // Set the rule categories associated with exceptions. This overrides any
102 // previously set categories. The original categories are manually
103 // reintroduced in the categoryTransitions table.
104 for r, e := range exceptions {
106 runeCategory[r] = e.cat
109 cat := map[string]category{
116 ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
117 switch v := p.String(1); v {
118 case "L", "D", "T", "R":
119 setCategory(p.Rune(0), cat[v])
124 gen.Repackage("gen_trieval.go", "trieval.go", "precis")
127 type exception struct {
133 // Programmatically add the Arabic and Indic digits to the exceptions map.
134 // See the comment in the exceptions map below for why these are marked disallowed.
135 for i := rune(0); i <= 9; i++ {
136 exceptions[0x0660+i] = exception{
138 cat: arabicIndicDigit,
140 exceptions[0x06F0+i] = exception{
142 cat: extendedArabicIndicDigit,
147 // The Exceptions class as defined in RFC 5892
148 // https://tools.ietf.org/html/rfc5892#section-2.6
149 var exceptions = map[rune]exception{
150 0x00DF: {prop: pValid},
151 0x03C2: {prop: pValid},
152 0x06FD: {prop: pValid},
153 0x06FE: {prop: pValid},
154 0x0F0B: {prop: pValid},
155 0x3007: {prop: pValid},
157 // ContextO|J rules are marked as disallowed, taking a "guilty until proven
158 // innocent" approach. The main reason for this is that the check for
159 // whether a context rule should be applied can be moved to the logic for
160 // handling disallowed runes, taking it off the common path. The exception to
161 // this rule is for katakanaMiddleDot, as the rule logic is handled without
162 // using a rule function.
164 // ContextJ (Join control)
165 0x200C: {prop: disallowed, cat: zeroWidthNonJoiner},
166 0x200D: {prop: disallowed, cat: zeroWidthJoiner},
169 0x00B7: {prop: disallowed, cat: middleDot},
170 0x0375: {prop: disallowed, cat: greekLowerNumeralSign},
171 0x05F3: {prop: disallowed, cat: hebrewPreceding}, // punctuation Geresh
172 0x05F4: {prop: disallowed, cat: hebrewPreceding}, // punctuation Gershayim
173 0x30FB: {prop: pValid, cat: katakanaMiddleDot},
175 // These are officially ContextO, but the implementation does not require
176 // special treatment of these, so we simply mark them as valid.
177 0x0660: {prop: pValid},
178 0x0661: {prop: pValid},
179 0x0662: {prop: pValid},
180 0x0663: {prop: pValid},
181 0x0664: {prop: pValid},
182 0x0665: {prop: pValid},
183 0x0666: {prop: pValid},
184 0x0667: {prop: pValid},
185 0x0668: {prop: pValid},
186 0x0669: {prop: pValid},
187 0x06F0: {prop: pValid},
188 0x06F1: {prop: pValid},
189 0x06F2: {prop: pValid},
190 0x06F3: {prop: pValid},
191 0x06F4: {prop: pValid},
192 0x06F5: {prop: pValid},
193 0x06F6: {prop: pValid},
194 0x06F7: {prop: pValid},
195 0x06F8: {prop: pValid},
196 0x06F9: {prop: pValid},
198 0x0640: {prop: disallowed},
199 0x07FA: {prop: disallowed},
200 0x302E: {prop: disallowed},
201 0x302F: {prop: disallowed},
202 0x3031: {prop: disallowed},
203 0x3032: {prop: disallowed},
204 0x3033: {prop: disallowed},
205 0x3034: {prop: disallowed},
206 0x3035: {prop: disallowed},
207 0x303B: {prop: disallowed},
210 // LetterDigits: https://tools.ietf.org/html/rfc5892#section-2.1
211 // r in {Ll, Lu, Lo, Nd, Lm, Mn, Mc}.
212 func isLetterDigits(r rune) bool {
214 unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo, // Letters
215 unicode.Mn, unicode.Mc, // Modifiers
216 unicode.Nd, // Digits
220 func isIdDisAndFreePVal(r rune) bool {
222 // OtherLetterDigits: https://tools.ietf.org/html/rfc7564#section-9.18
223 // r in {Lt, Nl, No, Me}
224 unicode.Lt, unicode.Nl, unicode.No, // Other letters / numbers
225 unicode.Me, // Modifiers
227 // Spaces: https://tools.ietf.org/html/rfc7564#section-9.14
231 // Symbols: https://tools.ietf.org/html/rfc7564#section-9.15
232 // r in {Sm, Sc, Sk, So}
233 unicode.Sm, unicode.Sc, unicode.Sk, unicode.So,
235 // Punctuation: https://tools.ietf.org/html/rfc7564#section-9.16
236 // r in {Pc, Pd, Ps, Pe, Pi, Pf, Po}
237 unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe,
238 unicode.Pi, unicode.Pf, unicode.Po,
242 // HasCompat: https://tools.ietf.org/html/rfc7564#section-9.17
243 func hasCompat(r rune) bool {
244 return !norm.NFKC.IsNormalString(string(r))
247 // From https://tools.ietf.org/html/rfc7564#section-8 (adapted from the RFC 5892 algorithm):
249 // If .cp. .in. Exceptions Then Exceptions(cp);
250 // Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp);
251 // Else If .cp. .in. Unassigned Then UNASSIGNED;
252 // Else If .cp. .in. ASCII7 Then PVALID;
253 // Else If .cp. .in. JoinControl Then CONTEXTJ;
254 // Else If .cp. .in. OldHangulJamo Then DISALLOWED;
255 // Else If .cp. .in. PrecisIgnorableProperties Then DISALLOWED;
256 // Else If .cp. .in. Controls Then DISALLOWED;
257 // Else If .cp. .in. HasCompat Then ID_DIS or FREE_PVAL;
258 // Else If .cp. .in. LetterDigits Then PVALID;
259 // Else If .cp. .in. OtherLetterDigits Then ID_DIS or FREE_PVAL;
260 // Else If .cp. .in. Spaces Then ID_DIS or FREE_PVAL;
261 // Else If .cp. .in. Symbols Then ID_DIS or FREE_PVAL;
262 // Else If .cp. .in. Punctuation Then ID_DIS or FREE_PVAL;
266 propTrie := triegen.NewTrie("derivedProperties")
267 w := gen.NewCodeWriter()
268 defer w.WriteGoFile(*outputFile, "precis")
269 gen.WriteUnicodeVersion(w)
271 // Iterate over all the runes...
272 for i := rune(0); i < unicode.MaxRune; i++ {
275 if !utf8.ValidRune(r) {
279 e, ok := exceptions[i]
283 case !unicode.In(r, assigned):
285 case r >= 0x0021 && r <= 0x007e: // Is ASCII 7
287 case unicode.In(r, disallowedRunes, unicode.Cc):
291 case isLetterDigits(r):
293 case isIdDisAndFreePVal(r):
298 cat := runeCategory[r]
299 // Don't set category for runes that are disallowed.
301 cat = exceptions[r].cat
303 propTrie.Insert(r, uint64(p)|uint64(cat))
305 sz, err := propTrie.Gen(w)