vendor/golang.org/x/text/secure/precis/gen.go

   1 // Copyright 2015 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Unicode table generator.
   6 // Data read from the web.
   7
   8 // +build ignore
   9
  10 package main
  11
  12 import (
  13         "flag"
  14         "log"
  15         "unicode"
  16         "unicode/utf8"
  17
  18         "golang.org/x/text/internal/gen"
  19         "golang.org/x/text/internal/triegen"
  20         "golang.org/x/text/internal/ucd"
  21         "golang.org/x/text/unicode/norm"
  22         "golang.org/x/text/unicode/rangetable"
  23 )
  24
  25 var outputFile = flag.String("output", "tables.go", "output file for generated tables; default tables.go")
  26
  27 var assigned, disallowedRunes *unicode.RangeTable
  28
  29 var runeCategory = map[rune]category{}
  30
  31 var overrides = map[category]category{
  32         viramaModifier: viramaJoinT,
  33         greek:          greekJoinT,
  34         hebrew:         hebrewJoinT,
  35 }
  36
  37 func setCategory(r rune, cat category) {
  38         if c, ok := runeCategory[r]; ok {
  39                 if override, ok := overrides[c]; cat == joiningT && ok {
  40                         cat = override
  41                 } else {
  42                         log.Fatalf("%U: multiple categories for rune (%v and %v)", r, c, cat)
  43                 }
  44         }
  45         runeCategory[r] = cat
  46 }
  47
  48 func init() {
  49         if numCategories > 1<<propShift {
  50                 log.Fatalf("Number of categories is %d; may at most be %d", numCategories, 1<<propShift)
  51         }
  52 }
  53
  54 func main() {
  55         gen.Init()
  56
  57         // Load data
  58         runes := []rune{}
  59         // PrecisIgnorableProperties: https://tools.ietf.org/html/rfc7564#section-9.13
  60         ucd.Parse(gen.OpenUCDFile("DerivedCoreProperties.txt"), func(p *ucd.Parser) {
  61                 if p.String(1) == "Default_Ignorable_Code_Point" {
  62                         runes = append(runes, p.Rune(0))
  63                 }
  64         })
  65         ucd.Parse(gen.OpenUCDFile("PropList.txt"), func(p *ucd.Parser) {
  66                 switch p.String(1) {
  67                 case "Noncharacter_Code_Point":
  68                         runes = append(runes, p.Rune(0))
  69                 }
  70         })
  71         // OldHangulJamo: https://tools.ietf.org/html/rfc5892#section-2.9
  72         ucd.Parse(gen.OpenUCDFile("HangulSyllableType.txt"), func(p *ucd.Parser) {
  73                 switch p.String(1) {
  74                 case "L", "V", "T":
  75                         runes = append(runes, p.Rune(0))
  76                 }
  77         })
  78
  79         disallowedRunes = rangetable.New(runes...)
  80         assigned = rangetable.Assigned(unicode.Version)
  81
  82         // Load category data.
  83         runeCategory['l'] = latinSmallL
  84         ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
  85                 const cccVirama = 9
  86                 if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
  87                         setCategory(p.Rune(0), viramaModifier)
  88                 }
  89         })
  90         ucd.Parse(gen.OpenUCDFile("Scripts.txt"), func(p *ucd.Parser) {
  91                 switch p.String(1) {
  92                 case "Greek":
  93                         setCategory(p.Rune(0), greek)
  94                 case "Hebrew":
  95                         setCategory(p.Rune(0), hebrew)
  96                 case "Hiragana", "Katakana", "Han":
  97                         setCategory(p.Rune(0), japanese)
  98                 }
  99         })
 100
 101         // Set the rule categories associated with exceptions. This overrides any
 102         // previously set categories. The original categories are manually
 103         // reintroduced in the categoryTransitions table.
 104         for r, e := range exceptions {
 105                 if e.cat != 0 {
 106                         runeCategory[r] = e.cat
 107                 }
 108         }
 109         cat := map[string]category{
 110                 "L": joiningL,
 111                 "D": joiningD,
 112                 "T": joiningT,
 113
 114                 "R": joiningR,
 115         }
 116         ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
 117                 switch v := p.String(1); v {
 118                 case "L", "D", "T", "R":
 119                         setCategory(p.Rune(0), cat[v])
 120                 }
 121         })
 122
 123         writeTables()
 124         gen.Repackage("gen_trieval.go", "trieval.go", "precis")
 125 }
 126
// exception is one entry of the exceptions map: the property, and optionally
// the category, that a single rune receives in place of the values derived
// from the Unicode data.
type exception struct {
	prop property // property writeTables stores for the rune, bypassing the derivation rules
	cat  category // category for the rune; main copies it into runeCategory only when non-zero
}
 131
 132 func init() {
 133         // Programmatically add the Arabic and Indic digits to the exceptions map.
 134         // See comment in the exceptions map below why these are marked disallowed.
 135         for i := rune(0); i <= 9; i++ {
 136                 exceptions[0x0660+i] = exception{
 137                         prop: disallowed,
 138                         cat:  arabicIndicDigit,
 139                 }
 140                 exceptions[0x06F0+i] = exception{
 141                         prop: disallowed,
 142                         cat:  extendedArabicIndicDigit,
 143                 }
 144         }
 145 }
 146
// The Exceptions class as defined in RFC 5892
// https://tools.ietf.org/html/rfc5892#section-2.6
//
// exceptions maps a rune to the property (and, where set, the category) that
// takes precedence over the values derived from the Unicode data files. The
// init function in this file adds to and overwrites entries of this map
// before main runs.
var exceptions = map[rune]exception{
	0x00DF: {prop: pValid},
	0x03C2: {prop: pValid},
	0x06FD: {prop: pValid},
	0x06FE: {prop: pValid},
	0x0F0B: {prop: pValid},
	0x3007: {prop: pValid},

	// ContextO|J rules are marked as disallowed, taking a "guilty until proven
	// innocent" approach. The main reason for this is that the check for
	// whether a context rule should be applied can be moved to the logic for
	// handling disallowed runes, taking it off the common path. The exception
	// to this rule is for katakanaMiddleDot, as the rule logic is handled
	// without using a rule function.

	// ContextJ (Join control)
	0x200C: {prop: disallowed, cat: zeroWidthNonJoiner},
	0x200D: {prop: disallowed, cat: zeroWidthJoiner},

	// ContextO
	0x00B7: {prop: disallowed, cat: middleDot},
	0x0375: {prop: disallowed, cat: greekLowerNumeralSign},
	0x05F3: {prop: disallowed, cat: hebrewPreceding}, // punctuation Geresh
	0x05F4: {prop: disallowed, cat: hebrewPreceding}, // punctuation Gershayim
	0x30FB: {prop: pValid, cat: katakanaMiddleDot},

	// These are officially ContextO, but the implementation does not require
	// special treatment of these, so we simply mark them as valid.
	//
	// NOTE(review): the init function in this file assigns all twenty of the
	// code points below again, as disallowed with a digit category, so the
	// pValid values listed here are replaced before main uses the map.
	0x0660: {prop: pValid},
	0x0661: {prop: pValid},
	0x0662: {prop: pValid},
	0x0663: {prop: pValid},
	0x0664: {prop: pValid},
	0x0665: {prop: pValid},
	0x0666: {prop: pValid},
	0x0667: {prop: pValid},
	0x0668: {prop: pValid},
	0x0669: {prop: pValid},
	0x06F0: {prop: pValid},
	0x06F1: {prop: pValid},
	0x06F2: {prop: pValid},
	0x06F3: {prop: pValid},
	0x06F4: {prop: pValid},
	0x06F5: {prop: pValid},
	0x06F6: {prop: pValid},
	0x06F7: {prop: pValid},
	0x06F8: {prop: pValid},
	0x06F9: {prop: pValid},

	// Runes that are disallowed without an associated category.
	0x0640: {prop: disallowed},
	0x07FA: {prop: disallowed},
	0x302E: {prop: disallowed},
	0x302F: {prop: disallowed},
	0x3031: {prop: disallowed},
	0x3032: {prop: disallowed},
	0x3033: {prop: disallowed},
	0x3034: {prop: disallowed},
	0x3035: {prop: disallowed},
	0x303B: {prop: disallowed},
}
 209
 210 // LetterDigits: https://tools.ietf.org/html/rfc5892#section-2.1
 211 // r in {Ll, Lu, Lo, Nd, Lm, Mn, Mc}.
 212 func isLetterDigits(r rune) bool {
 213         return unicode.In(r,
 214                 unicode.Ll, unicode.Lu, unicode.Lm, unicode.Lo, // Letters
 215                 unicode.Mn, unicode.Mc, // Modifiers
 216                 unicode.Nd, // Digits
 217         )
 218 }
 219
 220 func isIdDisAndFreePVal(r rune) bool {
 221         return unicode.In(r,
 222                 // OtherLetterDigits: https://tools.ietf.org/html/rfc7564#section-9.18
 223                 // r in in {Lt, Nl, No, Me}
 224                 unicode.Lt, unicode.Nl, unicode.No, // Other letters / numbers
 225                 unicode.Me, // Modifiers
 226
 227                 // Spaces: https://tools.ietf.org/html/rfc7564#section-9.14
 228                 // r in in {Zs}
 229                 unicode.Zs,
 230
 231                 // Symbols: https://tools.ietf.org/html/rfc7564#section-9.15
 232                 // r in {Sm, Sc, Sk, So}
 233                 unicode.Sm, unicode.Sc, unicode.Sk, unicode.So,
 234
 235                 // Punctuation: https://tools.ietf.org/html/rfc7564#section-9.16
 236                 // r in {Pc, Pd, Ps, Pe, Pi, Pf, Po}
 237                 unicode.Pc, unicode.Pd, unicode.Ps, unicode.Pe,
 238                 unicode.Pi, unicode.Pf, unicode.Po,
 239         )
 240 }
 241
 242 // HasCompat: https://tools.ietf.org/html/rfc7564#section-9.17
 243 func hasCompat(r rune) bool {
 244         return !norm.NFKC.IsNormalString(string(r))
 245 }
 246
// From https://tools.ietf.org/html/rfc7564#section-8:
 248 //
 249 // If .cp. .in. Exceptions Then Exceptions(cp);
 250 //   Else If .cp. .in. BackwardCompatible Then BackwardCompatible(cp);
 251 //   Else If .cp. .in. Unassigned Then UNASSIGNED;
 252 //   Else If .cp. .in. ASCII7 Then PVALID;
 253 //   Else If .cp. .in. JoinControl Then CONTEXTJ;
 254 //   Else If .cp. .in. OldHangulJamo Then DISALLOWED;
 255 //   Else If .cp. .in. PrecisIgnorableProperties Then DISALLOWED;
 256 //   Else If .cp. .in. Controls Then DISALLOWED;
 257 //   Else If .cp. .in. HasCompat Then ID_DIS or FREE_PVAL;
 258 //   Else If .cp. .in. LetterDigits Then PVALID;
 259 //   Else If .cp. .in. OtherLetterDigits Then ID_DIS or FREE_PVAL;
 260 //   Else If .cp. .in. Spaces Then ID_DIS or FREE_PVAL;
 261 //   Else If .cp. .in. Symbols Then ID_DIS or FREE_PVAL;
 262 //   Else If .cp. .in. Punctuation Then ID_DIS or FREE_PVAL;
 263 //   Else DISALLOWED;
 264
 265 func writeTables() {
 266         propTrie := triegen.NewTrie("derivedProperties")
 267         w := gen.NewCodeWriter()
 268         defer w.WriteGoFile(*outputFile, "precis")
 269         gen.WriteUnicodeVersion(w)
 270
 271         // Iterate over all the runes...
 272         for i := rune(0); i < unicode.MaxRune; i++ {
 273                 r := rune(i)
 274
 275                 if !utf8.ValidRune(r) {
 276                         continue
 277                 }
 278
 279                 e, ok := exceptions[i]
 280                 p := e.prop
 281                 switch {
 282                 case ok:
 283                 case !unicode.In(r, assigned):
 284                         p = unassigned
 285                 case r >= 0x0021 && r <= 0x007e: // Is ASCII 7
 286                         p = pValid
 287                 case unicode.In(r, disallowedRunes, unicode.Cc):
 288                         p = disallowed
 289                 case hasCompat(r):
 290                         p = idDisOrFreePVal
 291                 case isLetterDigits(r):
 292                         p = pValid
 293                 case isIdDisAndFreePVal(r):
 294                         p = idDisOrFreePVal
 295                 default:
 296                         p = disallowed
 297                 }
 298                 cat := runeCategory[r]
 299                 // Don't set category for runes that are disallowed.
 300                 if p == disallowed {
 301                         cat = exceptions[r].cat
 302                 }
 303                 propTrie.Insert(r, uint64(p)|uint64(cat))
 304         }
 305         sz, err := propTrie.Gen(w)
 306         if err != nil {
 307                 log.Fatal(err)
 308         }
 309         w.Size += sz
 310 }