vendor/golang.org/x/text/feature/plural/gen.go

   1 // Copyright 2016 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // +build ignore
   6
   7 package main
   8
   9 // This file generates data for the CLDR plural rules, as defined in
  10 //    http://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules
  11 //
  12 // We assume a slightly simplified grammar:
  13 //
  14 //              condition     = and_condition ('or' and_condition)* samples
  15 //              and_condition = relation ('and' relation)*
  16 //              relation      = expr ('=' | '!=') range_list
  17 //              expr          = operand ('%' '10' '0'* )?
  18 //              operand       = 'n' | 'i' | 'f' | 't' | 'v' | 'w'
  19 //              range_list    = (range | value) (',' range_list)*
  20 //              range         = value'..'value
  21 //              value         = digit+
  22 //              digit         = 0|1|2|3|4|5|6|7|8|9
  23 //
  24 //              samples       = ('@integer' sampleList)?
  25 //                              ('@decimal' sampleList)?
  26 //              sampleList    = sampleRange (',' sampleRange)* (',' ('…'|'...'))?
  27 //              sampleRange   = decimalValue ('~' decimalValue)?
  28 //              decimalValue  = value ('.' value)?
  29 //
  30 //              Symbol  Value
  31 //              n       absolute value of the source number (integer and decimals).
  32 //              i       integer digits of n.
  33 //              v       number of visible fraction digits in n, with trailing zeros.
  34 //              w       number of visible fraction digits in n, without trailing zeros.
  35 //              f       visible fractional digits in n, with trailing zeros.
  36 //              t       visible fractional digits in n, without trailing zeros.
  37 //
  38 // The algorithm for which the data is generated is based on the following
  39 // observations
  40 //
  41 //    - the number of different sets of numbers which the plural rules use to
  42 //      test inclusion is limited,
  43 //    - most numbers that are tested on are < 100
  44 //
  45 // This allows us to define a bitmap for each number < 100 where a bit i
  46 // indicates whether this number is included in some defined set i.
  47 // The function matchPlural in plural.go defines how we can subsequently use
  48 // this data to determine inclusion.
  49 //
// There are a few languages for which this doesn't work: Italian and
// Azerbaijani, which both test against numbers > 100 for ordinals, and Breton,
// which considers whether numbers are multiples of hundreds. The model here
// could be extended to handle Italian and Azerbaijani fairly easily (by
// considering the numbers 100, 200, 300, ..., 800, 900 in addition to the first
// 100), but for now it seems easier to just hard-code these cases.
  56
  57 import (
  58         "bufio"
  59         "bytes"
  60         "flag"
  61         "fmt"
  62         "log"
  63         "strconv"
  64         "strings"
  65
  66         "golang.org/x/text/internal"
  67         "golang.org/x/text/internal/gen"
  68         "golang.org/x/text/language"
  69         "golang.org/x/text/unicode/cldr"
  70 )
  71
// Command-line flags of the generator.
var (
	// NOTE(review): test is not referenced anywhere else in the visible part
	// of this file (the local variable named test in genPluralsTests shadows
	// it) — confirm whether it is still needed.
	test = flag.Bool("test", false,
		"test existing tables; can be used to compare web data with package data.")
	// outputFile is the file main writes the generated tables to.
	outputFile = flag.String("output", "tables.go", "output file")
	// outputTestFile is the file main writes the generated test data to.
	outputTestFile = flag.String("testoutput", "data_test.go", "output file")

	// NOTE(review): draft is not referenced anywhere else in the visible part
	// of this file — confirm whether it is consumed elsewhere.
	draft = flag.String("draft",
		"contributed",
		`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
)
  82
  83 func main() {
  84         gen.Init()
  85
  86         const pkg = "plural"
  87
  88         gen.Repackage("gen_common.go", "common.go", pkg)
  89         // Read the CLDR zip file.
  90         r := gen.OpenCLDRCoreZip()
  91         defer r.Close()
  92
  93         d := &cldr.Decoder{}
  94         d.SetDirFilter("supplemental", "main")
  95         d.SetSectionFilter("numbers", "plurals")
  96         data, err := d.DecodeZip(r)
  97         if err != nil {
  98                 log.Fatalf("DecodeZip: %v", err)
  99         }
 100
 101         w := gen.NewCodeWriter()
 102         defer w.WriteGoFile(*outputFile, pkg)
 103
 104         gen.WriteCLDRVersion(w)
 105
 106         genPlurals(w, data)
 107
 108         w = gen.NewCodeWriter()
 109         defer w.WriteGoFile(*outputTestFile, pkg)
 110
 111         genPluralsTests(w, data)
 112 }
 113
// pluralTest holds the sample numbers that the CLDR data lists for a single
// plural rule. genPluralsTests fills one value per rule from the "@integer"
// and "@decimal" sample lists and writes the type itself to the generated
// test file.
type pluralTest struct {
	locales string   // space-separated list of locales for this test
	form    int      // Use int instead of Form to simplify generation.
	integer []string // Entries of the form \d+ or \d+~\d+
	decimal []string // Entries of the form \f+ or \f+ +~\f+, where f is \d+\.\d+
}
 120
 121 func genPluralsTests(w *gen.CodeWriter, data *cldr.CLDR) {
 122         w.WriteType(pluralTest{})
 123
 124         for _, plurals := range data.Supplemental().Plurals {
 125                 if plurals.Type == "" {
 126                         // The empty type is reserved for plural ranges.
 127                         continue
 128                 }
 129                 tests := []pluralTest{}
 130
 131                 for _, pRules := range plurals.PluralRules {
 132                         for _, rule := range pRules.PluralRule {
 133                                 test := pluralTest{
 134                                         locales: pRules.Locales,
 135                                         form:    int(countMap[rule.Count]),
 136                                 }
 137                                 scan := bufio.NewScanner(strings.NewReader(rule.Data()))
 138                                 scan.Split(splitTokens)
 139                                 var p *[]string
 140                                 for scan.Scan() {
 141                                         switch t := scan.Text(); t {
 142                                         case "@integer":
 143                                                 p = &test.integer
 144                                         case "@decimal":
 145                                                 p = &test.decimal
 146                                         case ",", "…":
 147                                         default:
 148                                                 if p != nil {
 149                                                         *p = append(*p, t)
 150                                                 }
 151                                         }
 152                                 }
 153                                 tests = append(tests, test)
 154                         }
 155                 }
 156                 w.WriteVar(plurals.Type+"Tests", tests)
 157         }
 158 }
 159
 160 func genPlurals(w *gen.CodeWriter, data *cldr.CLDR) {
 161         for _, plurals := range data.Supplemental().Plurals {
 162                 if plurals.Type == "" {
 163                         continue
 164                 }
 165                 // Initialize setMap and inclusionMasks. They are already populated with
 166                 // a few entries to serve as an example and to assign nice numbers to
 167                 // common cases.
 168
 169                 // setMap contains sets of numbers represented by boolean arrays where
 170                 // a true value for element i means that the number i is included.
 171                 setMap := map[[numN]bool]int{
 172                         // The above init func adds an entry for including all numbers.
 173                         [numN]bool{1: true}: 1, // fix {1} to a nice value
 174                         [numN]bool{2: true}: 2, // fix {2} to a nice value
 175                         [numN]bool{0: true}: 3, // fix {0} to a nice value
 176                 }
 177
 178                 // inclusionMasks contains bit masks for every number under numN to
 179                 // indicate in which set the number is included. Bit 1 << x will be set
 180                 // if it is included in set x.
 181                 inclusionMasks := [numN]uint64{
 182                         // Note: these entries are not complete: more bits will be set along the way.
 183                         0: 1 << 3,
 184                         1: 1 << 1,
 185                         2: 1 << 2,
 186                 }
 187
 188                 // Create set {0..99}. We will assign this set the identifier 0.
 189                 var all [numN]bool
 190                 for i := range all {
 191                         // Mark number i as being included in the set (which has identifier 0).
 192                         inclusionMasks[i] |= 1 << 0
 193                         // Mark number i as included in the set.
 194                         all[i] = true
 195                 }
 196                 // Register the identifier for the set.
 197                 setMap[all] = 0
 198
 199                 rules := []pluralCheck{}
 200                 index := []byte{0}
 201                 langMap := map[int]byte{0: 0} // From compact language index to index
 202
 203                 for _, pRules := range plurals.PluralRules {
 204                         // Parse the rules.
 205                         var conds []orCondition
 206                         for _, rule := range pRules.PluralRule {
 207                                 form := countMap[rule.Count]
 208                                 conds = parsePluralCondition(conds, rule.Data(), form)
 209                         }
 210                         // Encode the rules.
 211                         for _, c := range conds {
 212                                 // If an or condition only has filters, we create an entry for
 213                                 // this filter and the set that contains all values.
 214                                 empty := true
 215                                 for _, b := range c.used {
 216                                         empty = empty && !b
 217                                 }
 218                                 if empty {
 219                                         rules = append(rules, pluralCheck{
 220                                                 cat:   byte(opMod<<opShift) | byte(c.form),
 221                                                 setID: 0, // all values
 222                                         })
 223                                         continue
 224                                 }
 225                                 // We have some entries with values.
 226                                 for i, set := range c.set {
 227                                         if !c.used[i] {
 228                                                 continue
 229                                         }
 230                                         index, ok := setMap[set]
 231                                         if !ok {
 232                                                 index = len(setMap)
 233                                                 setMap[set] = index
 234                                                 for i := range inclusionMasks {
 235                                                         if set[i] {
 236                                                                 inclusionMasks[i] |= 1 << uint64(index)
 237                                                         }
 238                                                 }
 239                                         }
 240                                         rules = append(rules, pluralCheck{
 241                                                 cat:   byte(i<<opShift | andNext),
 242                                                 setID: byte(index),
 243                                         })
 244                                 }
 245                                 // Now set the last entry to the plural form the rule matches.
 246                                 rules[len(rules)-1].cat &^= formMask
 247                                 rules[len(rules)-1].cat |= byte(c.form)
 248                         }
 249                         // Point the relevant locales to the created entries.
 250                         for _, loc := range strings.Split(pRules.Locales, " ") {
 251                                 if strings.TrimSpace(loc) == "" {
 252                                         continue
 253                                 }
 254                                 lang, ok := language.CompactIndex(language.MustParse(loc))
 255                                 if !ok {
 256                                         log.Printf("No compact index for locale %q", loc)
 257                                 }
 258                                 langMap[lang] = byte(len(index) - 1)
 259                         }
 260                         index = append(index, byte(len(rules)))
 261                 }
 262                 w.WriteVar(plurals.Type+"Rules", rules)
 263                 w.WriteVar(plurals.Type+"Index", index)
 264                 // Expand the values.
 265                 langToIndex := make([]byte, language.NumCompactTags)
 266                 for i := range langToIndex {
 267                         for p := i; ; p = int(internal.Parent[p]) {
 268                                 if x, ok := langMap[p]; ok {
 269                                         langToIndex[i] = x
 270                                         break
 271                                 }
 272                         }
 273                 }
 274                 w.WriteVar(plurals.Type+"LangToIndex", langToIndex)
 275                 // Need to convert array to slice because of golang.org/issue/7651.
 276                 // This will allow tables to be dropped when unused. This is especially
 277                 // relevant for the ordinal data, which I suspect won't be used as much.
 278                 w.WriteVar(plurals.Type+"InclusionMasks", inclusionMasks[:])
 279
 280                 if len(rules) > 0xFF {
 281                         log.Fatalf("Too many entries for rules: %#x", len(rules))
 282                 }
 283                 if len(index) > 0xFF {
 284                         log.Fatalf("Too many entries for index: %#x", len(index))
 285                 }
 286                 if len(setMap) > 64 { // maximum number of bits.
 287                         log.Fatalf("Too many entries for setMap: %d", len(setMap))
 288                 }
 289                 w.WriteComment(
 290                         "Slots used for %s: %X of 0xFF rules; %X of 0xFF indexes; %d of 64 sets",
 291                         plurals.Type, len(rules), len(index), len(setMap))
 292                 // Prevent comment from attaching to the next entry.
 293                 fmt.Fprint(w, "\n\n")
 294         }
 295 }
 296
// orCondition is one alternative of a plural rule: the relations of a single
// and_condition combined. parsePluralCondition creates one value per
// alternative and genPlurals encodes it.
type orCondition struct {
	original string // for debugging

	// form is the plural form selected when this alternative matches.
	form Form
	// used[op] reports whether operation op takes part in this alternative.
	// It is set by add and by the special cases in parsePluralCondition.
	used [32]bool
	// set[op][i] reports whether number i is still allowed for operation op.
	// parsePluralCondition starts with every number allowed; add narrows the
	// set.
	set [32][numN]bool
}
 304
 305 func (o *orCondition) add(op opID, mod int, v []int) (ok bool) {
 306         ok = true
 307         for _, x := range v {
 308                 if x >= maxMod {
 309                         ok = false
 310                         break
 311                 }
 312         }
 313         for i := 0; i < numN; i++ {
 314                 m := i
 315                 if mod != 0 {
 316                         m = i % mod
 317                 }
 318                 if !intIn(m, v) {
 319                         o.set[op][i] = false
 320                 }
 321         }
 322         if ok {
 323                 o.used[op] = true
 324         }
 325         return ok
 326 }
 327
 328 func intIn(x int, a []int) bool {
 329         for _, y := range a {
 330                 if x == y {
 331                         return true
 332                 }
 333         }
 334         return false
 335 }
 336
// operandIndex maps the operand symbols of the plural rule grammar to their
// operation identifiers. There is no entry for "t": parsePluralCondition
// rewrites that operand to "w" before consulting this map.
var operandIndex = map[string]opID{
	"i": opI,
	"n": opN,
	"f": opF,
	"v": opV,
	"w": opW,
}
 344
// parsePluralCondition parses the condition of a single pluralRule and appends
// the resulting or conditions to conds.
//
// Example rules:
//   // Category "one" in English: only allow 1 with no visible fraction
//   i = 1 and v = 0 @integer 1
//
//   // Category "few" in Czech: all numbers with visible fractions
//   v != 0   @decimal ...
//
//   // Category "zero" in Latvian: all multiples of 10 or the numbers 11-19 or
//   // numbers with a fraction 11..19 and no trailing zeros.
//   n % 10 = 0 or n % 100 = 11..19 or v = 2 and f % 100 = 11..19 @integer ...
//
// @integer and @decimal are followed by examples and are not relevant for the
// rule itself. They are used here to signal the termination of the rule.
//
// s is the text of the rule and f the plural form it selects. A rule that
// starts with @integer or @decimal has no condition; conds is then returned
// unchanged. Input that does not follow the expected grammar aborts the
// program through log.Fatalf.
func parsePluralCondition(conds []orCondition, s string, f Form) []orCondition {
	scan := bufio.NewScanner(strings.NewReader(s))
	scan.Split(splitTokens)
	// Each iteration of the outer loop builds one or condition.
	for {
		cond := orCondition{original: s, form: f}
		// Set all numbers to be allowed for all number classes and restrict
		// from here on.
		for i := range cond.set {
			for j := range cond.set[i] {
				cond.set[i][j] = true
			}
		}
		// Each iteration of andLoop parses one relation and the token that
		// follows it ("and", "or" or a sample marker).
	andLoop:
		for {
			var token string
			scan.Scan() // Must exist.
			switch class := scan.Text(); class {
			case "t":
				class = "w" // equal to w for t == 0
				fallthrough
			case "n", "i", "f", "v", "w":
				op := scanToken(scan)
				opCode := operandIndex[class]
				mod := 0
				if op == "%" {
					opCode |= opMod

					switch v := scanUint(scan); v {
					case 10, 100:
						mod = v
					case 1000:
						// A more general solution would be to allow checking
						// against multiples of 100 and include entries for the
						// numbers 100..900 in the inclusion masks. At the
						// moment this would only help Azerbaijan and Italian.

						// Italian doesn't use '%', so this must be Azerbaijan.
						// The rest of the rule is not parsed.
						cond.used[opAzerbaijan00s] = true
						return append(conds, cond)

					case 1000000:
						// Special case for Breton. The rest of the rule is
						// not parsed.
						cond.used[opBretonM] = true
						return append(conds, cond)

					default:
						log.Fatalf("Modulo value not supported %d", v)
					}
					op = scanToken(scan)
				}
				if op != "=" && op != "!=" {
					log.Fatalf("Unexpected op %q", op)
				}
				if op == "!=" {
					opCode |= opNotEqual
				}
				// a collects every value of the range list, with ranges
				// expanded to their individual numbers.
				a := []int{}
				v := scanUint(scan)
				// This covers both "w" and "t", as "t" was rewritten above.
				if class == "w" && v != 0 {
					log.Fatalf("Must compare against zero for operand type %q", class)
				}
				token = scanToken(scan)
				for {
					switch token {
					case "..":
						end := scanUint(scan)
						for ; v <= end; v++ {
							a = append(a, v)
						}
						token = scanToken(scan)
					default: // ",", "or", "and", "@..."
						a = append(a, v)
					}
					if token != "," {
						break
					}
					v = scanUint(scan)
					token = scanToken(scan)
				}
				if !cond.add(opCode, mod, a) {
					// Detected large numbers. As we ruled out Azerbaijan, this
					// must be the many rule for Italian ordinals.
					cond.set[opItalian800] = cond.set[opN]
					cond.used[opItalian800] = true
				}

			case "@integer", "@decimal": // "other" entry: tests only.
				return conds
			default:
				log.Fatalf("Unexpected operand class %q (%s)", class, s)
			}
			// token now holds whatever followed the relation.
			switch token {
			case "or":
				// Finish this alternative and start a fresh one.
				conds = append(conds, cond)
				break andLoop
			case "@integer", "@decimal": // examples
				// There is always an example in practice, so we always terminate here.
				if err := scan.Err(); err != nil {
					log.Fatal(err)
				}
				return append(conds, cond)
			case "and":
				// keep accumulating
			default:
				log.Fatalf("Unexpected token %q", token)
			}
		}
	}
}
 469
 470 func scanToken(scan *bufio.Scanner) string {
 471         scan.Scan()
 472         return scan.Text()
 473 }
 474
 475 func scanUint(scan *bufio.Scanner) int {
 476         scan.Scan()
 477         val, err := strconv.ParseUint(scan.Text(), 10, 32)
 478         if err != nil {
 479                 log.Fatal(err)
 480         }
 481         return int(val)
 482 }
 483
 484 // splitTokens can be used with bufio.Scanner to tokenize CLDR plural rules.
 485 func splitTokens(data []byte, atEOF bool) (advance int, token []byte, err error) {
 486         condTokens := [][]byte{
 487                 []byte(".."),
 488                 []byte(","),
 489                 []byte("!="),
 490                 []byte("="),
 491         }
 492         advance, token, err = bufio.ScanWords(data, atEOF)
 493         for _, t := range condTokens {
 494                 if len(t) >= len(token) {
 495                         continue
 496                 }
 497                 switch p := bytes.Index(token, t); {
 498                 case p == -1:
 499                 case p == 0:
 500                         advance = len(t)
 501                         token = token[:len(t)]
 502                         return advance - len(token) + len(t), token[:len(t)], err
 503                 case p < advance:
 504                         // Don't split when "=" overlaps "!=".
 505                         if t[0] == '=' && token[p-1] == '!' {
 506                                 continue
 507                         }
 508                         advance = p
 509                         token = token[:p]
 510                 }
 511         }
 512         return advance, token, err
 513 }