1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
9 // This file generates data for the CLDR plural rules, as defined in
10 // http://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules
12 // We assume a slightly simplified grammar:
14 // condition = and_condition ('or' and_condition)* samples
15 // and_condition = relation ('and' relation)*
16 // relation = expr ('=' | '!=') range_list
17 // expr = operand ('%' '10' '0'* )?
18 // operand = 'n' | 'i' | 'f' | 't' | 'v' | 'w'
19 // range_list = (range | value) (',' range_list)*
20 // range = value'..'value
22 // digit = 0|1|2|3|4|5|6|7|8|9
24 // samples = ('@integer' sampleList)?
25 // ('@decimal' sampleList)?
26 // sampleList = sampleRange (',' sampleRange)* (',' ('…'|'...'))?
27 // sampleRange = decimalValue ('~' decimalValue)?
28 // decimalValue = value ('.' value)?
31 // n absolute value of the source number (integer and decimals).
32 // i integer digits of n.
33 // v number of visible fraction digits in n, with trailing zeros.
34 // w number of visible fraction digits in n, without trailing zeros.
35 // f visible fractional digits in n, with trailing zeros.
36 // t visible fractional digits in n, without trailing zeros.
38 // The algorithm for which the data is generated is based on the following
41 // - the number of different sets of numbers which the plural rules use to
42 // test inclusion is limited,
43 // - most numbers that are tested on are < 100
45 // This allows us to define a bitmap for each number < 100 where a bit i
46 // indicates whether this number is included in some defined set i.
47 // The function matchPlural in plural.go defines how we can subsequently use
48 // this data to determine inclusion.
50 // There are a few languages for which this doesn't work: Italian and
51 // Azerbaijani, which both test against numbers > 100 for ordinals, and Breton,
52 // which considers whether numbers are multiples of hundreds. The model here
53 // could be extended to handle Italian and Azerbaijani fairly easily (by
54 // considering the numbers 100, 200, 300, ..., 800, 900 in addition to the first
55 // 100), but for now it seems easier to just hard-code these cases.
66 "golang.org/x/text/internal"
67 "golang.org/x/text/internal/gen"
68 "golang.org/x/text/language"
69 "golang.org/x/text/unicode/cldr"
// Command-line flags of the generator, registered through the flag package.
73 test = flag.Bool("test", false,
74 "test existing tables; can be used to compare web data with package data.")
// outputFile names the file the first code writer of this program is
// flushed to (default "tables.go").
75 outputFile = flag.String("output", "tables.go", "output file")
// outputTestFile names the file the second code writer, the one handed to
// genPluralsTests, is flushed to (default "data_test.go").
// NOTE(review): its help text is identical to that of -output; consider
// making the two distinguishable.
76 outputTestFile = flag.String("testoutput", "data_test.go", "output file")
78 draft = flag.String("draft",
80 `Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
// NOTE(review): presumably copies gen_common.go to common.go under package
// pkg; confirm against gen.Repackage.
88 gen.Repackage("gen_common.go", "common.go", pkg)
89 // Read the CLDR zip file.
90 r := gen.OpenCLDRCoreZip()
// Set the decoder's directory filter to "supplemental" and "main" and its
// section filter to "numbers" and "plurals" before decoding.
94 d.SetDirFilter("supplemental", "main")
95 d.SetSectionFilter("numbers", "plurals")
96 data, err := d.DecodeZip(r)
98 log.Fatalf("DecodeZip: %v", err)
// First writer: written to *outputFile when the surrounding function returns.
// The receiver of a deferred method call is evaluated when the defer
// statement executes, so reassigning w further down does not redirect this
// deferred call.
101 w := gen.NewCodeWriter()
102 defer w.WriteGoFile(*outputFile, pkg)
104 gen.WriteCLDRVersion(w)
// Second writer: receives the generated test data and is written to
// *outputTestFile.
108 w = gen.NewCodeWriter()
109 defer w.WriteGoFile(*outputTestFile, pkg)
111 genPluralsTests(w, data)
// pluralTest describes one generated test case: a group of locales, a plural
// form and sample numbers. genPluralsTests emits both the type definition
// (w.WriteType) and the collected values (w.WriteVar).
114 type pluralTest struct {
115 locales string // space-separated list of locales for this test
116 form int // Use int instead of Form to simplify generation.
117 integer []string // Entries of the form \d+ or \d+~\d+
118 decimal []string // Entries of the form \f+ or \f+ +~\f+, where f is \d+\.\d+
// genPluralsTests writes the pluralTest type to w and then, for every entry
// of data.Supplemental().Plurals, a variable named <Type>Tests holding the
// test cases collected from that entry's plural rules. Entries with an empty
// Type are special-cased at the top of the loop.
121 func genPluralsTests(w *gen.CodeWriter, data *cldr.CLDR) {
122 w.WriteType(pluralTest{})
124 for _, plurals := range data.Supplemental().Plurals {
125 if plurals.Type == "" {
126 // The empty type is reserved for plural ranges.
129 tests := []pluralTest{}
131 for _, pRules := range plurals.PluralRules {
132 for _, rule := range pRules.PluralRule {
134 locales: pRules.Locales,
// countMap translates the CLDR count name of the rule into a plural form,
// which is stored as a plain int (see pluralTest.form).
135 form: int(countMap[rule.Count]),
// Tokenize the rule text with the same split function that
// parsePluralCondition uses.
137 scan := bufio.NewScanner(strings.NewReader(rule.Data()))
138 scan.Split(splitTokens)
141 switch t := scan.Text(); t {
153 tests = append(tests, test)
156 w.WriteVar(plurals.Type+"Tests", tests)
// genPlurals generates the plural rule tables. For every entry of
// data.Supplemental().Plurals it writes the variables <Type>Rules,
// <Type>Index, <Type>LangToIndex and <Type>InclusionMasks to w. Entries with
// an empty Type are special-cased at the top of the loop. The program is
// terminated via log.Fatalf when the generated tables exceed the size limits
// checked near the end of the loop body.
160 func genPlurals(w *gen.CodeWriter, data *cldr.CLDR) {
161 for _, plurals := range data.Supplemental().Plurals {
162 if plurals.Type == "" {
165 // Initialize setMap and inclusionMasks. They are already populated with
166 // a few entries to serve as an example and to assign nice numbers to
169 // setMap contains sets of numbers represented by boolean arrays where
170 // a true value for element i means that the number i is included.
171 setMap := map[[numN]bool]int{
172 // The set containing all numbers is registered separately below (identifier 0).
173 [numN]bool{1: true}: 1, // fix {1} to a nice value
174 [numN]bool{2: true}: 2, // fix {2} to a nice value
175 [numN]bool{0: true}: 3, // fix {0} to a nice value
178 // inclusionMasks contains bit masks for every number under numN to
179 // indicate in which set the number is included. Bit 1 << x will be set
180 // if it is included in set x.
181 inclusionMasks := [numN]uint64{
182 // Note: these entries are not complete: more bits will be set along the way.
188 // Create set {0..99}. We will assign this set the identifier 0.
191 // Mark number i as being included in the set (which has identifier 0).
192 inclusionMasks[i] |= 1 << 0
193 // Mark number i as included in the set.
196 // Register the identifier for the set.
199 rules := []pluralCheck{}
201 langMap := map[int]byte{0: 0} // From compact language index to index
203 for _, pRules := range plurals.PluralRules {
// Collect the or-conditions of all rules of this locale group; each one
// records the plural form it belongs to.
205 var conds []orCondition
206 for _, rule := range pRules.PluralRule {
207 form := countMap[rule.Count]
208 conds = parsePluralCondition(conds, rule.Data(), form)
211 for _, c := range conds {
212 // If an or condition only has filters, we create an entry for
213 // this filter and the set that contains all values.
215 for _, b := range c.used {
219 rules = append(rules, pluralCheck{
220 cat: byte(opMod<<opShift) | byte(c.form),
221 setID: 0, // all values
225 // We have some entries with values.
226 for i, set := range c.set {
// Look up the identifier of this set of numbers in setMap.
230 index, ok := setMap[set]
234 for i := range inclusionMasks {
236 inclusionMasks[i] |= 1 << uint64(index)
240 rules = append(rules, pluralCheck{
241 cat: byte(i<<opShift | andNext),
245 // Now set the last entry to the plural form the rule matches.
246 rules[len(rules)-1].cat &^= formMask
247 rules[len(rules)-1].cat |= byte(c.form)
249 // Point the relevant locales to the created entries.
250 for _, loc := range strings.Split(pRules.Locales, " ") {
251 if strings.TrimSpace(loc) == "" {
254 lang, ok := language.CompactIndex(language.MustParse(loc))
256 log.Printf("No compact index for locale %q", loc)
258 langMap[lang] = byte(len(index) - 1)
260 index = append(index, byte(len(rules)))
262 w.WriteVar(plurals.Type+"Rules", rules)
263 w.WriteVar(plurals.Type+"Index", index)
264 // Expand the values.
265 langToIndex := make([]byte, language.NumCompactTags)
266 for i := range langToIndex {
// Walk up the parent chain (internal.Parent) starting at i until a language
// with an entry in langMap is found.
267 for p := i; ; p = int(internal.Parent[p]) {
268 if x, ok := langMap[p]; ok {
274 w.WriteVar(plurals.Type+"LangToIndex", langToIndex)
275 // Need to convert array to slice because of golang.org/issue/7651.
276 // This will allow tables to be dropped when unused. This is especially
277 // relevant for the ordinal data, which I suspect won't be used as much.
278 w.WriteVar(plurals.Type+"InclusionMasks", inclusionMasks[:])
// Rule offsets and index positions are stored as bytes (see the byte(...)
// conversions above) and every inclusion mask is a uint64 with one bit per
// set, hence the limits checked here.
280 if len(rules) > 0xFF {
281 log.Fatalf("Too many entries for rules: %#x", len(rules))
283 if len(index) > 0xFF {
284 log.Fatalf("Too many entries for index: %#x", len(index))
286 if len(setMap) > 64 { // maximum number of bits.
287 log.Fatalf("Too many entries for setMap: %d", len(setMap))
290 "Slots used for %s: %X of 0xFF rules; %X of 0xFF indexes; %d of 64 sets",
291 plurals.Type, len(rules), len(index), len(setMap))
292 // Prevent comment from attaching to the next entry.
293 fmt.Fprint(w, "\n\n")
// orCondition is the unit produced by parsePluralCondition and consumed by
// genPlurals. Besides the original rule text it is used in this file to carry
// the plural form (form), a set of allowed numbers per operand (set) and a
// flag per operand (used).
297 type orCondition struct {
298 original string // for debugging
// add applies the values v for operand op and modulo mod to the condition.
// Its caller treats a false result as "large numbers detected".
// NOTE(review): the exact update of o.set and o.used should be confirmed
// against the full body.
305 func (o *orCondition) add(op opID, mod int, v []int) (ok bool) {
307 for _, x := range v {
313 for i := 0; i < numN; i++ {
// intIn scans the elements of a; presumably it reports whether x is among
// them (TODO confirm against the full body).
328 func intIn(x int, a []int) bool {
329 for _, y := range a {
// operandIndex maps an operand name as it appears in a plural rule to its
// opID; parsePluralCondition uses it to look up the operand class it scanned.
337 var operandIndex = map[string]opID{
345 // parsePluralCondition parses the condition of a single pluralRule and appends
346 // the resulting or conditions to conds.
349 // // Category "one" in English: only allow 1 with no visible fraction
350 // i = 1 and v = 0 @integer 1
352 // // Category "few" in Czech: all numbers with visible fractions
353 // v != 0 @decimal ...
355 // // Category "zero" in Latvian: all multiples of 10 or the numbers 11-19 or
356 // // numbers with a fraction 11..19 and no trailing zeros.
357 // n % 10 = 0 or n % 100 = 11..19 or v = 2 and f % 100 = 11..19 @integer ...
359 // @integer and @decimal are followed by examples and are not relevant for the
360 // rule itself. They are used here to signal the termination of the rule.
//
// s is the rule text and f the plural form recorded in every condition
// created for it; the extended slice is returned. Input that the parser does
// not support terminates the program via log.Fatalf.
361 func parsePluralCondition(conds []orCondition, s string, f Form) []orCondition {
362 scan := bufio.NewScanner(strings.NewReader(s))
363 scan.Split(splitTokens)
365 cond := orCondition{original: s, form: f}
366 // Set all numbers to be allowed for all number classes and restrict
368 for i := range cond.set {
369 for j := range cond.set[i] {
370 cond.set[i][j] = true
376 scan.Scan() // Must exist.
377 switch class := scan.Text(); class {
379 class = "w" // equal to w for t == 0
381 case "n", "i", "f", "v", "w":
382 op := scanToken(scan)
383 opCode := operandIndex[class]
388 switch v := scanUint(scan); v {
392 // A more general solution would be to allow checking
393 // against multiples of 100 and include entries for the
394 // numbers 100..900 in the inclusion masks. At the
395 // moment this would only help Azerbaijan and Italian.
397 // Italian doesn't use '%', so this must be Azerbaijan.
398 cond.used[opAzerbaijan00s] = true
399 return append(conds, cond)
402 cond.used[opBretonM] = true
403 return append(conds, cond)
406 log.Fatalf("Modulo value not supported %d", v)
// Only equality and inequality relations are accepted.
410 if op != "=" && op != "!=" {
411 log.Fatalf("Unexpected op %q", op)
// Note that class may have been rewritten to "w" above.
418 if class == "w" && v != 0 {
419 log.Fatalf("Must compare against zero for operand type %q", class)
421 token = scanToken(scan)
// Range: the loop below runs over every value from v through end, inclusive.
425 end := scanUint(scan)
426 for ; v <= end; v++ {
429 token = scanToken(scan)
430 default: // ",", "or", "and", "@..."
437 token = scanToken(scan)
439 if !cond.add(opCode, mod, a) {
440 // Detected large numbers. As we ruled out Azerbaijan, this
441 // must be the many rule for Italian ordinals.
442 cond.set[opItalian800] = cond.set[opN]
443 cond.used[opItalian800] = true
446 case "@integer", "@decimal": // "other" entry: tests only.
449 log.Fatalf("Unexpected operand class %q (%s)", class, s)
453 conds = append(conds, cond)
455 case "@integer", "@decimal": // examples
456 // There is always an example in practice, so we always terminate here.
457 if err := scan.Err(); err != nil {
460 return append(conds, cond)
464 log.Fatalf("Unexpected token %q", token)
// scanToken returns a token read from scan as a string; callers in this file
// use it to fetch the operator and the separator tokens of a plural rule.
470 func scanToken(scan *bufio.Scanner) string {
// scanUint parses the scanner's current token text as a base-10 unsigned
// integer of at most 32 bits and returns it as an int.
475 func scanUint(scan *bufio.Scanner) int {
477 val, err := strconv.ParseUint(scan.Text(), 10, 32)
484 // splitTokens can be used with bufio.Scanner to tokenize CLDR plural rules.
// It starts from the whitespace-delimited word found by bufio.ScanWords and
// then looks for the entries of condTokens inside that word so that they can
// be split off as tokens of their own.
485 func splitTokens(data []byte, atEOF bool) (advance int, token []byte, err error) {
486 condTokens := [][]byte{
492 advance, token, err = bufio.ScanWords(data, atEOF)
493 for _, t := range condTokens {
// A word that is not longer than t cannot contain t plus further text.
494 if len(t) >= len(token) {
497 switch p := bytes.Index(token, t); {
501 token = token[:len(t)]
// NOTE(review): token was just truncated to len(t), so len(token) == len(t)
// here; the first result therefore equals advance and the second slicing of
// token is a no-op.
502 return advance - len(token) + len(t), token[:len(t)], err
504 // Don't split when "=" overlaps "!=".
505 if t[0] == '=' && token[p-1] == '!' {
512 return advance, token, err