vendor/golang.org/x/text/language/gen.go

   1 // Copyright 2013 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // +build ignore
   6
   7 // Language tag table generator.
   8 // Data read from the web.
   9
  10 package main
  11
  12 import (
  13         "bufio"
  14         "flag"
  15         "fmt"
  16         "io"
  17         "io/ioutil"
  18         "log"
  19         "math"
  20         "reflect"
  21         "regexp"
  22         "sort"
  23         "strconv"
  24         "strings"
  25
  26         "golang.org/x/text/internal/gen"
  27         "golang.org/x/text/internal/tag"
  28         "golang.org/x/text/unicode/cldr"
  29 )
  30
  31 var (
  32         test = flag.Bool("test",
  33                 false,
  34                 "test existing tables; can be used to compare web data with package data.")
  35         outputFile = flag.String("output",
  36                 "tables.go",
  37                 "output file for generated tables")
  38 )
  39
  40 var comment = []string{
  41         `
  42 lang holds an alphabetically sorted list of ISO-639 language identifiers.
  43 All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag.
  44 For 2-byte language identifiers, the two successive bytes have the following meaning:
  45     - if the first letter of the 2- and 3-letter ISO codes are the same:
  46       the second and third letter of the 3-letter ISO code.
  47     - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3.
  48 For 3-byte language identifiers the 4th byte is 0.`,
  49         `
  50 langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
  51 in lookup tables. The language ids for these language codes are derived directly
  52 from the letters and are not consecutive.`,
  53         `
  54 altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
  55 to 2-letter language codes that cannot be derived using the method described above.
  56 Each 3-letter code is followed by its 1-byte langID.`,
  57         `
  58 altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
  59         `
  60 langAliasMap maps langIDs to their suggested replacements.`,
  61         `
  62 script is an alphabetically sorted list of ISO 15924 codes. The index
  63 of the script in the string, divided by 4, is the internal scriptID.`,
  64         `
  65 isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
  66 for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
  67 the UN.M49 codes used for groups.)`,
  68         `
  69 regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
  70 Each 2-letter codes is followed by two bytes with the following meaning:
  71     - [A-Z}{2}: the first letter of the 2-letter code plus these two
  72                 letters form the 3-letter ISO code.
  73     - 0, n:     index into altRegionISO3.`,
  74         `
  75 regionTypes defines the status of a region for various standards.`,
  76         `
  77 m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
  78 codes indicating collections of regions.`,
  79         `
  80 m49Index gives indexes into fromM49 based on the three most significant bits
  81 of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in
  82    fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]]
  83 for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
  84 The region code is stored in the 9 lsb of the indexed value.`,
  85         `
  86 fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
  87         `
  88 altRegionISO3 holds a list of 3-letter region codes that cannot be
  89 mapped to 2-letter codes using the default algorithm. This is a short list.`,
  90         `
  91 altRegionIDs holds a list of regionIDs the positions of which match those
  92 of the 3-letter ISO codes in altRegionISO3.`,
  93         `
  94 variantNumSpecialized is the number of specialized variants in variants.`,
  95         `
  96 suppressScript is an index from langID to the dominant script for that language,
  97 if it exists.  If a script is given, it should be suppressed from the language tag.`,
  98         `
  99 likelyLang is a lookup table, indexed by langID, for the most likely
 100 scripts and regions given incomplete information. If more entries exist for a
 101 given language, region and script are the index and size respectively
 102 of the list in likelyLangList.`,
 103         `
 104 likelyLangList holds lists info associated with likelyLang.`,
 105         `
 106 likelyRegion is a lookup table, indexed by regionID, for the most likely
 107 languages and scripts given incomplete information. If more entries exist
 108 for a given regionID, lang and script are the index and size respectively
 109 of the list in likelyRegionList.
 110 TODO: exclude containers and user-definable regions from the list.`,
 111         `
 112 likelyRegionList holds lists info associated with likelyRegion.`,
 113         `
 114 likelyScript is a lookup table, indexed by scriptID, for the most likely
 115 languages and regions given a script.`,
 116         `
 117 matchLang holds pairs of langIDs of base languages that are typically
 118 mutually intelligible. Each pair is associated with a confidence and
 119 whether the intelligibility goes one or both ways.`,
 120         `
 121 matchScript holds pairs of scriptIDs where readers of one script
 122 can typically also read the other. Each is associated with a confidence.`,
 123         `
 124 nRegionGroups is the number of region groups.`,
 125         `
 126 regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
 127 where each set holds all groupings that are directly connected in a region
 128 containment graph.`,
 129         `
 130 regionInclusionBits is an array of bit vectors where every vector represents
 131 a set of region groupings.  These sets are used to compute the distance
 132 between two regions for the purpose of language matching.`,
 133         `
 134 regionInclusionNext marks, for each entry in regionInclusionBits, the set of
 135 all groups that are reachable from the groups set in the respective entry.`,
 136 }
 137
 138 // TODO: consider changing some of these structures to tries. This can reduce
 139 // memory, but may increase the need for memory allocations. This could be
 140 // mitigated if we can piggyback on language tags for common cases.
 141
 142 func failOnError(e error) {
 143         if e != nil {
 144                 log.Panic(e)
 145         }
 146 }
 147
 148 type setType int
 149
 150 const (
 151         Indexed setType = 1 + iota // all elements must be of same size
 152         Linear
 153 )
 154
 155 type stringSet struct {
 156         s              []string
 157         sorted, frozen bool
 158
 159         // We often need to update values after the creation of an index is completed.
 160         // We include a convenience map for keeping track of this.
 161         update map[string]string
 162         typ    setType // used for checking.
 163 }
 164
 165 func (ss *stringSet) clone() stringSet {
 166         c := *ss
 167         c.s = append([]string(nil), c.s...)
 168         return c
 169 }
 170
 171 func (ss *stringSet) setType(t setType) {
 172         if ss.typ != t && ss.typ != 0 {
 173                 log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
 174         }
 175 }
 176
 177 // parse parses a whitespace-separated string and initializes ss with its
 178 // components.
 179 func (ss *stringSet) parse(s string) {
 180         scan := bufio.NewScanner(strings.NewReader(s))
 181         scan.Split(bufio.ScanWords)
 182         for scan.Scan() {
 183                 ss.add(scan.Text())
 184         }
 185 }
 186
 187 func (ss *stringSet) assertChangeable() {
 188         if ss.frozen {
 189                 log.Panic("attempt to modify a frozen stringSet")
 190         }
 191 }
 192
 193 func (ss *stringSet) add(s string) {
 194         ss.assertChangeable()
 195         ss.s = append(ss.s, s)
 196         ss.sorted = ss.frozen
 197 }
 198
 199 func (ss *stringSet) freeze() {
 200         ss.compact()
 201         ss.frozen = true
 202 }
 203
 204 func (ss *stringSet) compact() {
 205         if ss.sorted {
 206                 return
 207         }
 208         a := ss.s
 209         sort.Strings(a)
 210         k := 0
 211         for i := 1; i < len(a); i++ {
 212                 if a[k] != a[i] {
 213                         a[k+1] = a[i]
 214                         k++
 215                 }
 216         }
 217         ss.s = a[:k+1]
 218         ss.sorted = ss.frozen
 219 }
 220
 221 type funcSorter struct {
 222         fn func(a, b string) bool
 223         sort.StringSlice
 224 }
 225
 226 func (s funcSorter) Less(i, j int) bool {
 227         return s.fn(s.StringSlice[i], s.StringSlice[j])
 228 }
 229
 230 func (ss *stringSet) sortFunc(f func(a, b string) bool) {
 231         ss.compact()
 232         sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
 233 }
 234
 235 func (ss *stringSet) remove(s string) {
 236         ss.assertChangeable()
 237         if i, ok := ss.find(s); ok {
 238                 copy(ss.s[i:], ss.s[i+1:])
 239                 ss.s = ss.s[:len(ss.s)-1]
 240         }
 241 }
 242
 243 func (ss *stringSet) replace(ol, nu string) {
 244         ss.s[ss.index(ol)] = nu
 245         ss.sorted = ss.frozen
 246 }
 247
 248 func (ss *stringSet) index(s string) int {
 249         ss.setType(Indexed)
 250         i, ok := ss.find(s)
 251         if !ok {
 252                 if i < len(ss.s) {
 253                         log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
 254                 }
 255                 log.Panicf("find: item %q is not in list", s)
 256
 257         }
 258         return i
 259 }
 260
 261 func (ss *stringSet) find(s string) (int, bool) {
 262         ss.compact()
 263         i := sort.SearchStrings(ss.s, s)
 264         return i, i != len(ss.s) && ss.s[i] == s
 265 }
 266
 267 func (ss *stringSet) slice() []string {
 268         ss.compact()
 269         return ss.s
 270 }
 271
 272 func (ss *stringSet) updateLater(v, key string) {
 273         if ss.update == nil {
 274                 ss.update = map[string]string{}
 275         }
 276         ss.update[v] = key
 277 }
 278
 279 // join joins the string and ensures that all entries are of the same length.
 280 func (ss *stringSet) join() string {
 281         ss.setType(Indexed)
 282         n := len(ss.s[0])
 283         for _, s := range ss.s {
 284                 if len(s) != n {
 285                         log.Panicf("join: not all entries are of the same length: %q", s)
 286                 }
 287         }
 288         ss.s = append(ss.s, strings.Repeat("\xff", n))
 289         return strings.Join(ss.s, "")
 290 }
 291
// ianaEntry holds information for an entry in the IANA Language Subtag Repository.
// All types use the same entry.
// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various
// fields.
//
// Each field is filled by parseRegistry from the registry key named in its
// comment.
type ianaEntry struct {
	typ            string   // "Type:"
	description    []string // one element per "Description:" key
	scope          string   // "Scope:"
	added          string   // "Added:"
	preferred      string   // "Preferred-Value:"
	deprecated     string   // "Deprecated:"
	suppressScript string   // "Suppress-Script:"
	macro          string   // "Macrolanguage:"
	prefix         []string // one element per "Prefix:" key
}
 307
// builder holds the decoded CLDR data, the parsed IANA registry and the
// string indices derived from them, together with the writers used to emit
// the generated tables.
type builder struct {
	w    *gen.CodeWriter
	hw   io.Writer // MultiWriter for w and w.Hash
	data *cldr.CLDR
	supp *cldr.SupplementalData

	// indices
	locale      stringSet // common locales
	lang        stringSet // canonical language ids (2 or 3 letter ISO codes) with data
	langNoIndex stringSet // 3-letter ISO codes with no associated data
	script      stringSet // 4-letter ISO codes
	region      stringSet // 2-letter ISO or 3-digit UN M49 codes
	variant     stringSet // 4-8-alphanumeric variant code.

	// Region codes that are groups with their corresponding group IDs.
	// Populated by computeRegionGroups; keys are positions in region.
	groups map[int]index

	// langInfo
	// registry maps a subtag or tag to its IANA entry; filled by parseRegistry.
	registry map[string]*ianaEntry
}
 328
// index is the type of the group IDs stored in builder.groups.
type index uint
 330
 331 func newBuilder(w *gen.CodeWriter) *builder {
 332         r := gen.OpenCLDRCoreZip()
 333         defer r.Close()
 334         d := &cldr.Decoder{}
 335         data, err := d.DecodeZip(r)
 336         failOnError(err)
 337         b := builder{
 338                 w:    w,
 339                 hw:   io.MultiWriter(w, w.Hash),
 340                 data: data,
 341                 supp: data.Supplemental(),
 342         }
 343         b.parseRegistry()
 344         return &b
 345 }
 346
// parseRegistry reads the IANA language subtag registry and fills b.registry
// with one ianaEntry per subtag or tag.
//
// The file is scanned word by word as a stream of "Key: value" pairs. The
// loop relies on a sliding-token trick: when a key is not recognized, or after
// a description has been read, the loop continues WITHOUT advancing the
// scanner, so the token that was just read as the value is examined as the
// next key. This skips over unknown content one word at a time until a known
// key is found. The order of the Scan calls is therefore significant.
//
// It panics if the scanner reports an error.
func (b *builder) parseRegistry() {
	r := gen.OpenIANAFile("assignments/language-subtag-registry")
	defer r.Close()
	b.registry = make(map[string]*ianaEntry)

	scan := bufio.NewScanner(r)
	scan.Split(bufio.ScanWords)
	// record is the entry currently being filled; it is replaced whenever a
	// "Type:" key is seen.
	var record *ianaEntry
	for more := scan.Scan(); more; {
		key := scan.Text()
		more = scan.Scan()
		value := scan.Text()
		switch key {
		case "Type:":
			record = &ianaEntry{typ: value}
		case "Subtag:", "Tag:":
			// A value of the form "x..y" denotes a range; every code from x
			// up to and including y is registered with the same record.
			if s := strings.SplitN(value, "..", 2); len(s) > 1 {
				for a := s[0]; a <= s[1]; a = inc(a) {
					b.addToRegistry(a, record)
				}
			} else {
				b.addToRegistry(value, record)
			}
		case "Suppress-Script:":
			record.suppressScript = value
		case "Added:":
			record.added = value
		case "Deprecated:":
			record.deprecated = value
		case "Macrolanguage:":
			record.macro = value
		case "Preferred-Value:":
			record.preferred = value
		case "Prefix:":
			record.prefix = append(record.prefix, value)
		case "Scope:":
			record.scope = value
		case "Description:":
			// A description may span several words: keep appending words
			// until one starts with '%' or ends in ':'.
			buf := []byte(value)
			for more = scan.Scan(); more; more = scan.Scan() {
				// NOTE: this b shadows the receiver within the inner loop.
				b := scan.Bytes()
				if b[0] == '%' || b[len(b)-1] == ':' {
					break
				}
				buf = append(buf, ' ')
				buf = append(buf, b...)
			}
			record.description = append(record.description, string(buf))
			// The terminating word is still the scanner's current token and
			// becomes the key of the next iteration.
			continue
		default:
			// Unknown key: re-examine the current token (value) as a key.
			continue
		}
		// A known key/value pair was consumed; advance to the next key.
		more = scan.Scan()
	}
	if scan.Err() != nil {
		log.Panic(scan.Err())
	}
}
 405
 406 func (b *builder) addToRegistry(key string, entry *ianaEntry) {
 407         if info, ok := b.registry[key]; ok {
 408                 if info.typ != "language" || entry.typ != "extlang" {
 409                         log.Fatalf("parseRegistry: tag %q already exists", key)
 410                 }
 411         } else {
 412                 b.registry[key] = entry
 413         }
 414 }
 415
 416 var commentIndex = make(map[string]string)
 417
 418 func init() {
 419         for _, s := range comment {
 420                 key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0])
 421                 commentIndex[key] = s
 422         }
 423 }
 424
 425 func (b *builder) comment(name string) {
 426         if s := commentIndex[name]; len(s) > 0 {
 427                 b.w.WriteComment(s)
 428         } else {
 429                 fmt.Fprintln(b.w)
 430         }
 431 }
 432
 433 func (b *builder) pf(f string, x ...interface{}) {
 434         fmt.Fprintf(b.hw, f, x...)
 435         fmt.Fprint(b.hw, "\n")
 436 }
 437
 438 func (b *builder) p(x ...interface{}) {
 439         fmt.Fprintln(b.hw, x...)
 440 }
 441
// addSize adds s to the running size total kept in b.w.Size and writes a
// "// Size: s bytes" comment line to the output.
func (b *builder) addSize(s int) {
	b.w.Size += s
	b.pf("// Size: %d bytes", s)
}
 446
// writeConst writes the documentation registered for name (if any), followed
// by a constant declaration of name with value x via the code writer.
func (b *builder) writeConst(name string, x interface{}) {
	b.comment(name)
	b.w.WriteConst(name, x)
}
 451
 452 // writeConsts computes f(v) for all v in values and writes the results
 453 // as constants named _v to a single constant block.
 454 func (b *builder) writeConsts(f func(string) int, values ...string) {
 455         b.pf("const (")
 456         for _, v := range values {
 457                 b.pf("\t_%s = %v", v, f(v))
 458         }
 459         b.pf(")")
 460 }
 461
// writeType writes the type of the given value, which must be a struct.
// The documentation to emit is looked up under the type's name.
func (b *builder) writeType(value interface{}) {
	b.comment(reflect.TypeOf(value).Name())
	b.w.WriteType(value)
}
 467
// writeSlice writes the slice or array ss as a variable called name. It is
// shorthand for writeSliceAddSize with no extra size.
func (b *builder) writeSlice(name string, ss interface{}) {
	b.writeSliceAddSize(name, 0, ss)
}
 471
 472 func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) {
 473         b.comment(name)
 474         b.w.Size += extraSize
 475         v := reflect.ValueOf(ss)
 476         t := v.Type().Elem()
 477         b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len())
 478
 479         fmt.Fprintf(b.w, "var %s = ", name)
 480         b.w.WriteArray(ss)
 481         b.p()
 482 }
 483
// fromTo is a single mapping from one 16-bit index to another. It is the
// element type of the tables written by writeSortedMap.
type fromTo struct {
	from, to uint16
}
 487
 488 func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) {
 489         ss.sortFunc(func(a, b string) bool {
 490                 return index(a) < index(b)
 491         })
 492         m := []fromTo{}
 493         for _, s := range ss.s {
 494                 m = append(m, fromTo{index(s), index(ss.update[s])})
 495         }
 496         b.writeSlice(name, m)
 497 }
 498
 499 const base = 'z' - 'a' + 1
 500
 501 func strToInt(s string) uint {
 502         v := uint(0)
 503         for i := 0; i < len(s); i++ {
 504                 v *= base
 505                 v += uint(s[i] - 'a')
 506         }
 507         return v
 508 }
 509
 510 // converts the given integer to the original ASCII string passed to strToInt.
 511 // len(s) must match the number of characters obtained.
 512 func intToStr(v uint, s []byte) {
 513         for i := len(s) - 1; i >= 0; i-- {
 514                 s[i] = byte(v%base) + 'a'
 515                 v /= base
 516         }
 517 }
 518
 519 func (b *builder) writeBitVector(name string, ss []string) {
 520         vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8)))
 521         for _, s := range ss {
 522                 v := strToInt(s)
 523                 vec[v/8] |= 1 << (v % 8)
 524         }
 525         b.writeSlice(name, vec)
 526 }
 527
 528 // TODO: convert this type into a list or two-stage trie.
 529 func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
 530         b.comment(name)
 531         v := reflect.ValueOf(m)
 532         sz := v.Len() * (2 + int(v.Type().Key().Size()))
 533         for _, k := range m {
 534                 sz += len(k)
 535         }
 536         b.addSize(sz)
 537         keys := []string{}
 538         b.pf(`var %s = map[string]uint16{`, name)
 539         for k := range m {
 540                 keys = append(keys, k)
 541         }
 542         sort.Strings(keys)
 543         for _, k := range keys {
 544                 b.pf("\t%q: %v,", k, f(m[k]))
 545         }
 546         b.p("}")
 547 }
 548
 549 func (b *builder) writeMap(name string, m interface{}) {
 550         b.comment(name)
 551         v := reflect.ValueOf(m)
 552         sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size()))
 553         b.addSize(sz)
 554         f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool {
 555                 return strings.IndexRune("{}, ", r) != -1
 556         })
 557         sort.Strings(f[1:])
 558         b.pf(`var %s = %s{`, name, f[0])
 559         for _, kv := range f[1:] {
 560                 b.pf("\t%s,", kv)
 561         }
 562         b.p("}")
 563 }
 564
 565 func (b *builder) langIndex(s string) uint16 {
 566         if s == "und" {
 567                 return 0
 568         }
 569         if i, ok := b.lang.find(s); ok {
 570                 return uint16(i)
 571         }
 572         return uint16(strToInt(s)) + uint16(len(b.lang.s))
 573 }
 574
 575 // inc advances the string to its lexicographical successor.
 576 func inc(s string) string {
 577         const maxTagLength = 4
 578         var buf [maxTagLength]byte
 579         intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)])
 580         for i := 0; i < len(s); i++ {
 581                 if s[i] <= 'Z' {
 582                         buf[i] -= 'a' - 'A'
 583                 }
 584         }
 585         return string(buf[:len(s)])
 586 }
 587
 588 func (b *builder) parseIndices() {
 589         meta := b.supp.Metadata
 590
 591         for k, v := range b.registry {
 592                 var ss *stringSet
 593                 switch v.typ {
 594                 case "language":
 595                         if len(k) == 2 || v.suppressScript != "" || v.scope == "special" {
 596                                 b.lang.add(k)
 597                                 continue
 598                         } else {
 599                                 ss = &b.langNoIndex
 600                         }
 601                 case "region":
 602                         ss = &b.region
 603                 case "script":
 604                         ss = &b.script
 605                 case "variant":
 606                         ss = &b.variant
 607                 default:
 608                         continue
 609                 }
 610                 ss.add(k)
 611         }
 612         // Include any language for which there is data.
 613         for _, lang := range b.data.Locales() {
 614                 if x := b.data.RawLDML(lang); false ||
 615                         x.LocaleDisplayNames != nil ||
 616                         x.Characters != nil ||
 617                         x.Delimiters != nil ||
 618                         x.Measurement != nil ||
 619                         x.Dates != nil ||
 620                         x.Numbers != nil ||
 621                         x.Units != nil ||
 622                         x.ListPatterns != nil ||
 623                         x.Collations != nil ||
 624                         x.Segmentations != nil ||
 625                         x.Rbnf != nil ||
 626                         x.Annotations != nil ||
 627                         x.Metadata != nil {
 628
 629                         from := strings.Split(lang, "_")
 630                         if lang := from[0]; lang != "root" {
 631                                 b.lang.add(lang)
 632                         }
 633                 }
 634         }
 635         // Include locales for plural rules, which uses a different structure.
 636         for _, plurals := range b.data.Supplemental().Plurals {
 637                 for _, rules := range plurals.PluralRules {
 638                         for _, lang := range strings.Split(rules.Locales, " ") {
 639                                 if lang = strings.Split(lang, "_")[0]; lang != "root" {
 640                                         b.lang.add(lang)
 641                                 }
 642                         }
 643                 }
 644         }
 645         // Include languages in likely subtags.
 646         for _, m := range b.supp.LikelySubtags.LikelySubtag {
 647                 from := strings.Split(m.From, "_")
 648                 b.lang.add(from[0])
 649         }
 650         // Include ISO-639 alpha-3 bibliographic entries.
 651         for _, a := range meta.Alias.LanguageAlias {
 652                 if a.Reason == "bibliographic" {
 653                         b.langNoIndex.add(a.Type)
 654                 }
 655         }
 656         // Include regions in territoryAlias (not all are in the IANA registry!)
 657         for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
 658                 if len(reg.Type) == 2 {
 659                         b.region.add(reg.Type)
 660                 }
 661         }
 662
 663         for _, s := range b.lang.s {
 664                 if len(s) == 3 {
 665                         b.langNoIndex.remove(s)
 666                 }
 667         }
 668         b.writeConst("numLanguages", len(b.lang.slice())+len(b.langNoIndex.slice()))
 669         b.writeConst("numScripts", len(b.script.slice()))
 670         b.writeConst("numRegions", len(b.region.slice()))
 671
 672         // Add dummy codes at the start of each list to represent "unspecified".
 673         b.lang.add("---")
 674         b.script.add("----")
 675         b.region.add("---")
 676
 677         // common locales
 678         b.locale.parse(meta.DefaultContent.Locales)
 679 }
 680
// TODO: region inclusion data will probably not be used in future matchers.
 682
 683 func (b *builder) computeRegionGroups() {
 684         b.groups = make(map[int]index)
 685
 686         // Create group indices.
 687         for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID.
 688                 b.groups[i] = index(len(b.groups))
 689         }
 690         for _, g := range b.supp.TerritoryContainment.Group {
 691                 // Skip UN and EURO zone as they are flattening the containment
 692                 // relationship.
 693                 if g.Type == "EZ" || g.Type == "UN" {
 694                         continue
 695                 }
 696                 group := b.region.index(g.Type)
 697                 if _, ok := b.groups[group]; !ok {
 698                         b.groups[group] = index(len(b.groups))
 699                 }
 700         }
 701         if len(b.groups) > 64 {
 702                 log.Fatalf("only 64 groups supported, found %d", len(b.groups))
 703         }
 704         b.writeConst("nRegionGroups", len(b.groups))
 705 }
 706
// langConsts lists the language codes for which writeLanguage emits a
// constant named _<code> holding the code's language index.
var langConsts = []string{
	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
	"et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is",
	"it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml",
	"mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt",
	"ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th",
	"tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu",

	// constants for grandfathered tags (if not already defined)
	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu",
	"nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
}
 719
 720 // writeLanguage generates all tables needed for language canonicalization.
 721 func (b *builder) writeLanguage() {
 722         meta := b.supp.Metadata
 723
 724         b.writeConst("nonCanonicalUnd", b.lang.index("und"))
 725         b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
 726         b.writeConst("langPrivateStart", b.langIndex("qaa"))
 727         b.writeConst("langPrivateEnd", b.langIndex("qtz"))
 728
 729         // Get language codes that need to be mapped (overlong 3-letter codes,
 730         // deprecated 2-letter codes, legacy and grandfathered tags.)
 731         langAliasMap := stringSet{}
 732         aliasTypeMap := map[string]langAliasType{}
 733
 734         // altLangISO3 get the alternative ISO3 names that need to be mapped.
 735         altLangISO3 := stringSet{}
 736         // Add dummy start to avoid the use of index 0.
 737         altLangISO3.add("---")
 738         altLangISO3.updateLater("---", "aa")
 739
 740         lang := b.lang.clone()
 741         for _, a := range meta.Alias.LanguageAlias {
 742                 if a.Replacement == "" {
 743                         a.Replacement = "und"
 744                 }
 745                 // TODO: support mapping to tags
 746                 repl := strings.SplitN(a.Replacement, "_", 2)[0]
 747                 if a.Reason == "overlong" {
 748                         if len(a.Replacement) == 2 && len(a.Type) == 3 {
 749                                 lang.updateLater(a.Replacement, a.Type)
 750                         }
 751                 } else if len(a.Type) <= 3 {
 752                         switch a.Reason {
 753                         case "macrolanguage":
 754                                 aliasTypeMap[a.Type] = langMacro
 755                         case "deprecated":
 756                                 // handled elsewhere
 757                                 continue
 758                         case "bibliographic", "legacy":
 759                                 if a.Type == "no" {
 760                                         continue
 761                                 }
 762                                 aliasTypeMap[a.Type] = langLegacy
 763                         default:
 764                                 log.Fatalf("new %s alias: %s", a.Reason, a.Type)
 765                         }
 766                         langAliasMap.add(a.Type)
 767                         langAliasMap.updateLater(a.Type, repl)
 768                 }
 769         }
 770         // Manually add the mapping of "nb" (Norwegian) to its macro language.
 771         // This can be removed if CLDR adopts this change.
 772         langAliasMap.add("nb")
 773         langAliasMap.updateLater("nb", "no")
 774         aliasTypeMap["nb"] = langMacro
 775
 776         for k, v := range b.registry {
 777                 // Also add deprecated values for 3-letter ISO codes, which CLDR omits.
 778                 if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
 779                         langAliasMap.add(k)
 780                         langAliasMap.updateLater(k, v.preferred)
 781                         aliasTypeMap[k] = langDeprecated
 782                 }
 783         }
 784         // Fix CLDR mappings.
 785         lang.updateLater("tl", "tgl")
 786         lang.updateLater("sh", "hbs")
 787         lang.updateLater("mo", "mol")
 788         lang.updateLater("no", "nor")
 789         lang.updateLater("tw", "twi")
 790         lang.updateLater("nb", "nob")
 791         lang.updateLater("ak", "aka")
 792         lang.updateLater("bh", "bih")
 793
 794         // Ensure that each 2-letter code is matched with a 3-letter code.
 795         for _, v := range lang.s[1:] {
 796                 s, ok := lang.update[v]
 797                 if !ok {
 798                         if s, ok = lang.update[langAliasMap.update[v]]; !ok {
 799                                 continue
 800                         }
 801                         lang.update[v] = s
 802                 }
 803                 if v[0] != s[0] {
 804                         altLangISO3.add(s)
 805                         altLangISO3.updateLater(s, v)
 806                 }
 807         }
 808
 809         // Complete canonicalized language tags.
 810         lang.freeze()
 811         for i, v := range lang.s {
 812                 // We can avoid these manual entries by using the IANA registry directly.
 813                 // Seems easier to update the list manually, as changes are rare.
 814                 // The panic in this loop will trigger if we miss an entry.
 815                 add := ""
 816                 if s, ok := lang.update[v]; ok {
 817                         if s[0] == v[0] {
 818                                 add = s[1:]
 819                         } else {
 820                                 add = string([]byte{0, byte(altLangISO3.index(s))})
 821                         }
 822                 } else if len(v) == 3 {
 823                         add = "\x00"
 824                 } else {
 825                         log.Panicf("no data for long form of %q", v)
 826                 }
 827                 lang.s[i] += add
 828         }
 829         b.writeConst("lang", tag.Index(lang.join()))
 830
 831         b.writeConst("langNoIndexOffset", len(b.lang.s))
 832
 833         // space of all valid 3-letter language identifiers.
 834         b.writeBitVector("langNoIndex", b.langNoIndex.slice())
 835
 836         altLangIndex := []uint16{}
 837         for i, s := range altLangISO3.slice() {
 838                 altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
 839                 if i > 0 {
 840                         idx := b.lang.index(altLangISO3.update[s])
 841                         altLangIndex = append(altLangIndex, uint16(idx))
 842                 }
 843         }
 844         b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
 845         b.writeSlice("altLangIndex", altLangIndex)
 846
 847         b.writeSortedMap("langAliasMap", &langAliasMap, b.langIndex)
 848         types := make([]langAliasType, len(langAliasMap.s))
 849         for i, s := range langAliasMap.s {
 850                 types[i] = aliasTypeMap[s]
 851         }
 852         b.writeSlice("langAliasTypes", types)
 853 }
 854
// scriptConsts lists the 4-letter script subtags that writeScript hands to
// b.writeConsts together with b.script.index (presumably so that a named
// constant is generated for each of them).
var scriptConsts = []string{
	"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
	"Zzzz",
}
 859
 860 func (b *builder) writeScript() {
 861         b.writeConsts(b.script.index, scriptConsts...)
 862         b.writeConst("script", tag.Index(b.script.join()))
 863
 864         supp := make([]uint8, len(b.lang.slice()))
 865         for i, v := range b.lang.slice()[1:] {
 866                 if sc := b.registry[v].suppressScript; sc != "" {
 867                         supp[i+1] = uint8(b.script.index(sc))
 868                 }
 869         }
 870         b.writeSlice("suppressScript", supp)
 871
 872         // There is only one deprecated script in CLDR. This value is hard-coded.
 873         // We check here if the code must be updated.
 874         for _, a := range b.supp.Metadata.Alias.ScriptAlias {
 875                 if a.Type != "Qaai" {
 876                         log.Panicf("unexpected deprecated stript %q", a.Type)
 877                 }
 878         }
 879 }
 880
 881 func parseM49(s string) int16 {
 882         if len(s) == 0 {
 883                 return 0
 884         }
 885         v, err := strconv.ParseUint(s, 10, 10)
 886         failOnError(err)
 887         return int16(v)
 888 }
 889
// regionConsts lists the region subtags that writeRegion hands to
// b.writeConsts together with b.region.index (presumably so that a named
// constant is generated for each of them).
var regionConsts = []string{
	"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
}
 894
 895 func (b *builder) writeRegion() {
 896         b.writeConsts(b.region.index, regionConsts...)
 897
 898         isoOffset := b.region.index("AA")
 899         m49map := make([]int16, len(b.region.slice()))
 900         fromM49map := make(map[int16]int)
 901         altRegionISO3 := ""
 902         altRegionIDs := []uint16{}
 903
 904         b.writeConst("isoRegionOffset", isoOffset)
 905
 906         // 2-letter region lookup and mapping to numeric codes.
 907         regionISO := b.region.clone()
 908         regionISO.s = regionISO.s[isoOffset:]
 909         regionISO.sorted = false
 910
 911         regionTypes := make([]byte, len(b.region.s))
 912
 913         // Is the region valid BCP 47?
 914         for s, e := range b.registry {
 915                 if len(s) == 2 && s == strings.ToUpper(s) {
 916                         i := b.region.index(s)
 917                         for _, d := range e.description {
 918                                 if strings.Contains(d, "Private use") {
 919                                         regionTypes[i] = iso3166UserAssigned
 920                                 }
 921                         }
 922                         regionTypes[i] |= bcp47Region
 923                 }
 924         }
 925
 926         // Is the region a valid ccTLD?
 927         r := gen.OpenIANAFile("domains/root/db")
 928         defer r.Close()
 929
 930         buf, err := ioutil.ReadAll(r)
 931         failOnError(err)
 932         re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`)
 933         for _, m := range re.FindAllSubmatch(buf, -1) {
 934                 i := b.region.index(strings.ToUpper(string(m[1])))
 935                 regionTypes[i] |= ccTLD
 936         }
 937
 938         b.writeSlice("regionTypes", regionTypes)
 939
 940         iso3Set := make(map[string]int)
 941         update := func(iso2, iso3 string) {
 942                 i := regionISO.index(iso2)
 943                 if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] {
 944                         regionISO.s[i] += iso3[1:]
 945                         iso3Set[iso3] = -1
 946                 } else {
 947                         if ok && j >= 0 {
 948                                 regionISO.s[i] += string([]byte{0, byte(j)})
 949                         } else {
 950                                 iso3Set[iso3] = len(altRegionISO3)
 951                                 regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
 952                                 altRegionISO3 += iso3
 953                                 altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
 954                         }
 955                 }
 956         }
 957         for _, tc := range b.supp.CodeMappings.TerritoryCodes {
 958                 i := regionISO.index(tc.Type) + isoOffset
 959                 if d := m49map[i]; d != 0 {
 960                         log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
 961                 }
 962                 m49 := parseM49(tc.Numeric)
 963                 m49map[i] = m49
 964                 if r := fromM49map[m49]; r == 0 {
 965                         fromM49map[m49] = i
 966                 } else if r != i {
 967                         dep := b.registry[regionISO.s[r-isoOffset]].deprecated
 968                         if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) {
 969                                 fromM49map[m49] = i
 970                         }
 971                 }
 972         }
 973         for _, ta := range b.supp.Metadata.Alias.TerritoryAlias {
 974                 if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 {
 975                         from := parseM49(ta.Type)
 976                         if r := fromM49map[from]; r == 0 {
 977                                 fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset
 978                         }
 979                 }
 980         }
 981         for _, tc := range b.supp.CodeMappings.TerritoryCodes {
 982                 if len(tc.Alpha3) == 3 {
 983                         update(tc.Type, tc.Alpha3)
 984                 }
 985         }
 986         // This entries are not included in territoryCodes. Mostly 3-letter variants
 987         // of deleted codes and an entry for QU.
 988         for _, m := range []struct{ iso2, iso3 string }{
 989                 {"CT", "CTE"},
 990                 {"DY", "DHY"},
 991                 {"HV", "HVO"},
 992                 {"JT", "JTN"},
 993                 {"MI", "MID"},
 994                 {"NH", "NHB"},
 995                 {"NQ", "ATN"},
 996                 {"PC", "PCI"},
 997                 {"PU", "PUS"},
 998                 {"PZ", "PCZ"},
 999                 {"RH", "RHO"},
1000                 {"VD", "VDR"},
1001                 {"WK", "WAK"},
1002                 // These three-letter codes are used for others as well.
1003                 {"FQ", "ATF"},
1004         } {
1005                 update(m.iso2, m.iso3)
1006         }
1007         for i, s := range regionISO.s {
1008                 if len(s) != 4 {
1009                         regionISO.s[i] = s + "  "
1010                 }
1011         }
1012         b.writeConst("regionISO", tag.Index(regionISO.join()))
1013         b.writeConst("altRegionISO3", altRegionISO3)
1014         b.writeSlice("altRegionIDs", altRegionIDs)
1015
1016         // Create list of deprecated regions.
1017         // TODO: consider inserting SF -> FI. Not included by CLDR, but is the only
1018         // Transitionally-reserved mapping not included.
1019         regionOldMap := stringSet{}
1020         // Include regions in territoryAlias (not all are in the IANA registry!)
1021         for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
1022                 if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 {
1023                         regionOldMap.add(reg.Type)
1024                         regionOldMap.updateLater(reg.Type, reg.Replacement)
1025                         i, _ := regionISO.find(reg.Type)
1026                         j, _ := regionISO.find(reg.Replacement)
1027                         if k := m49map[i+isoOffset]; k == 0 {
1028                                 m49map[i+isoOffset] = m49map[j+isoOffset]
1029                         }
1030                 }
1031         }
1032         b.writeSortedMap("regionOldMap", &regionOldMap, func(s string) uint16 {
1033                 return uint16(b.region.index(s))
1034         })
1035         // 3-digit region lookup, groupings.
1036         for i := 1; i < isoOffset; i++ {
1037                 m := parseM49(b.region.s[i])
1038                 m49map[i] = m
1039                 fromM49map[m] = i
1040         }
1041         b.writeSlice("m49", m49map)
1042
1043         const (
1044                 searchBits = 7
1045                 regionBits = 9
1046         )
1047         if len(m49map) >= 1<<regionBits {
1048                 log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits)
1049         }
1050         m49Index := [9]int16{}
1051         fromM49 := []uint16{}
1052         m49 := []int{}
1053         for k, _ := range fromM49map {
1054                 m49 = append(m49, int(k))
1055         }
1056         sort.Ints(m49)
1057         for _, k := range m49[1:] {
1058                 val := (k & (1<<searchBits - 1)) << regionBits
1059                 fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)]))
1060                 m49Index[1:][k>>searchBits] = int16(len(fromM49))
1061         }
1062         b.writeSlice("m49Index", m49Index)
1063         b.writeSlice("fromM49", fromM49)
1064 }
1065
// Space-separated lists of 2-letter region codes. Judging by the names and the
// notes below these are the ISO 3166 exceptionally reserved, transitionally
// reserved and deleted codes known to CLDR, respectively — TODO confirm
// against the ISO 3166 reservation lists.
const (
	// TODO: put these lists in regionTypes as user data? Could be used for
	// various optimizations and refinements and could be exposed in the API.
	iso3166Except = "AC CP DG EA EU FX IC SU TA UK"
	iso3166Trans  = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions.
	// DY and RH are actually not deleted, but indeterminately reserved.
	iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
)
1074
// Bit flags combined per region into the regionTypes table by writeRegion.
const (
	// iso3166UserAssigned is set for a region whose registry entry has a
	// description containing "Private use".
	iso3166UserAssigned = 1 << iota
	// ccTLD is set for a region whose lower-cased code is linked from the
	// IANA root zone database listing.
	ccTLD
	// bcp47Region is set for every upper-case 2-letter key in the registry.
	bcp47Region
)
1080
// find reports the position of the first element of list that equals s.
// It returns -1 when list does not contain s.
func find(list []string, s string) int {
	for i := 0; i < len(list); i++ {
		if list[i] == s {
			return i
		}
	}
	return -1
}
1089
1090 // writeVariants generates per-variant information and creates a map from variant
1091 // name to index value. We assign index values such that sorting multiple
1092 // variants by index value will result in the correct order.
1093 // There are two types of variants: specialized and general. Specialized variants
1094 // are only applicable to certain language or language-script pairs. Generalized
1095 // variants apply to any language. Generalized variants always sort after
1096 // specialized variants.  We will therefore always assign a higher index value
1097 // to a generalized variant than any other variant. Generalized variants are
1098 // sorted alphabetically among themselves.
1099 // Specialized variants may also sort after other specialized variants. Such
1100 // variants will be ordered after any of the variants they may follow.
1101 // We assume that if a variant x is followed by a variant y, then for any prefix
1102 // p of x, p-x is a prefix of y. This allows us to order tags based on the
1103 // maximum of the length of any of its prefixes.
1104 // TODO: it is possible to define a set of Prefix values on variants such that
1105 // a total order cannot be defined to the point that this algorithm breaks.
1106 // In other words, we cannot guarantee the same order of variants for the
1107 // future using the same algorithm or for non-compliant combinations of
1108 // variants. For this reason, consider using simple alphabetic sorting
1109 // of variants and ignore Prefix restrictions altogether.
1110 func (b *builder) writeVariant() {
1111         generalized := stringSet{}
1112         specialized := stringSet{}
1113         specializedExtend := stringSet{}
1114         // Collate the variants by type and check assumptions.
1115         for _, v := range b.variant.slice() {
1116                 e := b.registry[v]
1117                 if len(e.prefix) == 0 {
1118                         generalized.add(v)
1119                         continue
1120                 }
1121                 c := strings.Split(e.prefix[0], "-")
1122                 hasScriptOrRegion := false
1123                 if len(c) > 1 {
1124                         _, hasScriptOrRegion = b.script.find(c[1])
1125                         if !hasScriptOrRegion {
1126                                 _, hasScriptOrRegion = b.region.find(c[1])
1127
1128                         }
1129                 }
1130                 if len(c) == 1 || len(c) == 2 && hasScriptOrRegion {
1131                         // Variant is preceded by a language.
1132                         specialized.add(v)
1133                         continue
1134                 }
1135                 // Variant is preceded by another variant.
1136                 specializedExtend.add(v)
1137                 prefix := c[0] + "-"
1138                 if hasScriptOrRegion {
1139                         prefix += c[1]
1140                 }
1141                 for _, p := range e.prefix {
1142                         // Verify that the prefix minus the last element is a prefix of the
1143                         // predecessor element.
1144                         i := strings.LastIndex(p, "-")
1145                         pred := b.registry[p[i+1:]]
1146                         if find(pred.prefix, p[:i]) < 0 {
1147                                 log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v)
1148                         }
1149                         // The sorting used below does not work in the general case. It works
1150                         // if we assume that variants that may be followed by others only have
1151                         // prefixes of the same length. Verify this.
1152                         count := strings.Count(p[:i], "-")
1153                         for _, q := range pred.prefix {
1154                                 if c := strings.Count(q, "-"); c != count {
1155                                         log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count)
1156                                 }
1157                         }
1158                         if !strings.HasPrefix(p, prefix) {
1159                                 log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix)
1160                         }
1161                 }
1162         }
1163
1164         // Sort extended variants.
1165         a := specializedExtend.s
1166         less := func(v, w string) bool {
1167                 // Sort by the maximum number of elements.
1168                 maxCount := func(s string) (max int) {
1169                         for _, p := range b.registry[s].prefix {
1170                                 if c := strings.Count(p, "-"); c > max {
1171                                         max = c
1172                                 }
1173                         }
1174                         return
1175                 }
1176                 if cv, cw := maxCount(v), maxCount(w); cv != cw {
1177                         return cv < cw
1178                 }
1179                 // Sort by name as tie breaker.
1180                 return v < w
1181         }
1182         sort.Sort(funcSorter{less, sort.StringSlice(a)})
1183         specializedExtend.frozen = true
1184
1185         // Create index from variant name to index.
1186         variantIndex := make(map[string]uint8)
1187         add := func(s []string) {
1188                 for _, v := range s {
1189                         variantIndex[v] = uint8(len(variantIndex))
1190                 }
1191         }
1192         add(specialized.slice())
1193         add(specializedExtend.s)
1194         numSpecialized := len(variantIndex)
1195         add(generalized.slice())
1196         if n := len(variantIndex); n > 255 {
1197                 log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n)
1198         }
1199         b.writeMap("variantIndex", variantIndex)
1200         b.writeConst("variantNumSpecialized", numSpecialized)
1201 }
1202
// writeLanguageInfo currently does nothing; its body is empty.
func (b *builder) writeLanguageInfo() {
}
1205
// writeLikelyData writes tables that are used both for finding parent relations and for
// language matching.  Each entry contains additional bits to indicate the status of the
// data to know when it cannot be used for parent relations.
//
// The input is the list of CLDR likelySubtag mappings in b.supp. The tables
// written are, in order: likelyScript, likelyLang, likelyLangList,
// likelyRegion, likelyRegionList and likelyRegionGroup, each preceded by the
// definition of its element type where one is needed.
func (b *builder) writeLikelyData() {
	// Flag bits stored in the flags field of the generated entries.
	//   isList:       the entry refers to a range in the corresponding *List
	//                 table instead of holding the data itself.
	//   scriptInFrom: the mapping's "from" tag also specified a script.
	//   regionInFrom: the mapping's "from" tag also specified a region.
	const (
		isList = 1 << iota
		scriptInFrom
		regionInFrom
	)
	type ( // generated types
		likelyScriptRegion struct {
			region uint16
			script uint8
			flags  uint8
		}
		likelyLangScript struct {
			lang   uint16
			script uint8
			flags  uint8
		}
		likelyLangRegion struct {
			lang   uint16
			region uint16
		}
		// likelyTag is used for getting likely tags for group regions, where
		// the likely region might be a region contained in the group.
		likelyTag struct {
			lang   uint16
			region uint16
			script uint8
		}
	)
	var ( // generated variables
		likelyRegionGroup = make([]likelyTag, len(b.groups))
		likelyLang        = make([]likelyScriptRegion, len(b.lang.s))
		likelyRegion      = make([]likelyLangScript, len(b.region.s))
		likelyScript      = make([]likelyLangRegion, len(b.script.s))
		likelyLangList    = []likelyScriptRegion{}
		likelyRegionList  = []likelyLangScript{}
	)
	// fromTo holds the subtags of one likelySubtag mapping, split on "_".
	type fromTo struct {
		from, to []string
	}
	// Mappings collected per language index and per region index. They are
	// turned into table entries after all mappings have been seen.
	langToOther := map[int][]fromTo{}
	regionToOther := map[int][]fromTo{}
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		to := strings.Split(m.To, "_")
		// Sanity checks: "to" must be a full language_script_region tag and
		// may only fill in what "from" left unspecified.
		if len(to) != 3 {
			log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to))
		}
		if len(from) > 3 {
			log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from))
		}
		if from[0] != to[0] && from[0] != "und" {
			log.Fatalf("unexpected language change in expansion: %s -> %s", from, to)
		}
		if len(from) == 3 {
			if from[2] != to[2] {
				log.Fatalf("unexpected region change in expansion: %s -> %s", from, to)
			}
			if from[0] != "und" {
				log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to)
			}
		}
		if len(from) == 1 || from[0] != "und" {
			// Keyed by language. A bare "und" is stored under id 0.
			id := 0
			if from[0] != "und" {
				id = b.lang.index(from[0])
			}
			langToOther[id] = append(langToOther[id], fromTo{from, to})
		} else if len(from) == 2 && len(from[1]) == 4 {
			// und_Script: keyed by script.
			sid := b.script.index(from[1])
			likelyScript[sid].lang = uint16(b.langIndex(to[0]))
			likelyScript[sid].region = uint16(b.region.index(to[2]))
		} else {
			// und_Region or und_Script_Region: keyed by region, which is the
			// last subtag of "from".
			r := b.region.index(from[len(from)-1])
			if id, ok := b.groups[r]; ok {
				if from[0] != "und" {
					log.Fatalf("region changed unexpectedly: %s -> %s", from, to)
				}
				likelyRegionGroup[id].lang = uint16(b.langIndex(to[0]))
				likelyRegionGroup[id].script = uint8(b.script.index(to[1]))
				likelyRegionGroup[id].region = uint16(b.region.index(to[2]))
			} else {
				regionToOther[r] = append(regionToOther[r], fromTo{from, to})
			}
		}
	}
	b.writeType(likelyLangRegion{})
	b.writeSlice("likelyScript", likelyScript)

	for id := range b.lang.s {
		list := langToOther[id]
		if len(list) == 1 {
			// A single mapping is stored inline.
			likelyLang[id].region = uint16(b.region.index(list[0].to[2]))
			likelyLang[id].script = uint8(b.script.index(list[0].to[1]))
		} else if len(list) > 1 {
			// Multiple mappings: the region field holds the start offset in
			// likelyLangList and the script field holds the number of entries.
			likelyLang[id].flags = isList
			likelyLang[id].region = uint16(len(likelyLangList))
			likelyLang[id].script = uint8(len(list))
			for _, x := range list {
				flags := uint8(0)
				if len(x.from) > 1 {
					// The second "from" subtag is a region if it equals the
					// resulting region; otherwise it is taken to be a script.
					if x.from[1] == x.to[2] {
						flags = regionInFrom
					} else {
						flags = scriptInFrom
					}
				}
				likelyLangList = append(likelyLangList, likelyScriptRegion{
					region: uint16(b.region.index(x.to[2])),
					script: uint8(b.script.index(x.to[1])),
					flags:  flags,
				})
			}
		}
	}
	// TODO: merge suppressScript data with this table.
	b.writeType(likelyScriptRegion{})
	b.writeSlice("likelyLang", likelyLang)
	b.writeSlice("likelyLangList", likelyLangList)

	for id := range b.region.s {
		list := regionToOther[id]
		if len(list) == 1 {
			// A single mapping is stored inline.
			likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0]))
			likelyRegion[id].script = uint8(b.script.index(list[0].to[1]))
			if len(list[0].from) > 2 {
				likelyRegion[id].flags = scriptInFrom
			}
		} else if len(list) > 1 {
			// Multiple mappings: the lang field holds the start offset in
			// likelyRegionList and the script field holds the number of
			// entries.
			likelyRegion[id].flags = isList
			likelyRegion[id].lang = uint16(len(likelyRegionList))
			likelyRegion[id].script = uint8(len(list))
			for i, x := range list {
				// Only the first entry may omit the script; every later entry
				// must be of the form und_Script_Region.
				if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 {
					log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i)
				}
				x := likelyLangScript{
					lang:   uint16(b.langIndex(x.to[0])),
					script: uint8(b.script.index(x.to[1])),
				}
				// NOTE(review): this tests list[0], not the current element, so
				// when the first mapping has no script none of the entries get
				// scriptInFrom. Confirm whether this is intended before
				// changing it; it affects the generated table.
				if len(list[0].from) > 2 {
					x.flags = scriptInFrom
				}
				likelyRegionList = append(likelyRegionList, x)
			}
		}
	}
	b.writeType(likelyLangScript{})
	b.writeSlice("likelyRegion", likelyRegion)
	b.writeSlice("likelyRegionList", likelyRegionList)

	b.writeType(likelyTag{})
	b.writeSlice("likelyRegionGroup", likelyRegionGroup)
}
1363
// mutualIntelligibility is one language-only match entry produced by
// writeMatchData from a CLDR languageMatch element.
type mutualIntelligibility struct {
	// want and have are the language indexes of the desired and the
	// supported language, respectively.
	want, have uint16
	// distance is the parsed Distance attribute of the languageMatch element.
	distance uint8
	// oneway is true if the element's Oneway attribute is "true".
	oneway bool
}
1369
// scriptIntelligibility is one language-script match entry produced by
// writeMatchData from a CLDR languageMatch element. Matches that are not
// one-way are stored as two entries, one per direction.
type scriptIntelligibility struct {
	// Language indexes of the desired (want) and supported (have) language.
	wantLang, haveLang uint16
	// Script indexes of the desired (want) and supported (have) script.
	wantScript, haveScript uint8
	// distance is the parsed Distance attribute of the languageMatch element.
	distance uint8
	// Always oneway
}
1376
// regionIntelligibility is the element type of the region match entries
// collected by writeMatchData (presumably from the 3-subtag languageMatch
// elements — TODO confirm against the rest of writeMatchData).
type regionIntelligibility struct {
	lang     uint16 // compact language id
	script   uint8  // 0 means any
	group    uint8  // 0 means any; if bit 7 is set it means inverse
	distance uint8
	// Always twoway.
}
1384
1385 // writeMatchData writes tables with languages and scripts for which there is
1386 // mutual intelligibility. The data is based on CLDR's languageMatching data.
1387 // Note that we use a different algorithm than the one defined by CLDR and that
1388 // we slightly modify the data. For example, we convert scores to confidence levels.
1389 // We also drop all region-related data as we use a different algorithm to
1390 // determine region equivalence.
1391 func (b *builder) writeMatchData() {
1392         lm := b.supp.LanguageMatching.LanguageMatches
1393         cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
1394
1395         regionHierarchy := map[string][]string{}
1396         for _, g := range b.supp.TerritoryContainment.Group {
1397                 regions := strings.Split(g.Contains, " ")
1398                 regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
1399         }
1400         regionToGroups := make([]uint8, len(b.region.s))
1401
1402         idToIndex := map[string]uint8{}
1403         for i, mv := range lm[0].MatchVariable {
1404                 if i > 6 {
1405                         log.Fatalf("Too many groups: %d", i)
1406                 }
1407                 idToIndex[mv.Id] = uint8(i + 1)
1408                 // TODO: also handle '-'
1409                 for _, r := range strings.Split(mv.Value, "+") {
1410                         todo := []string{r}
1411                         for k := 0; k < len(todo); k++ {
1412                                 r := todo[k]
1413                                 regionToGroups[b.region.index(r)] |= 1 << uint8(i)
1414                                 todo = append(todo, regionHierarchy[r]...)
1415                         }
1416                 }
1417         }
1418         b.writeSlice("regionToGroups", regionToGroups)
1419
1420         // maps language id to in- and out-of-group region.
1421         paradigmLocales := [][3]uint16{}
1422         locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
1423         for i := 0; i < len(locales); i += 2 {
1424                 x := [3]uint16{}
1425                 for j := 0; j < 2; j++ {
1426                         pc := strings.SplitN(locales[i+j], "-", 2)
1427                         x[0] = b.langIndex(pc[0])
1428                         if len(pc) == 2 {
1429                                 x[1+j] = uint16(b.region.index(pc[1]))
1430                         }
1431                 }
1432                 paradigmLocales = append(paradigmLocales, x)
1433         }
1434         b.writeSlice("paradigmLocales", paradigmLocales)
1435
1436         b.writeType(mutualIntelligibility{})
1437         b.writeType(scriptIntelligibility{})
1438         b.writeType(regionIntelligibility{})
1439
1440         matchLang := []mutualIntelligibility{}
1441         matchScript := []scriptIntelligibility{}
1442         matchRegion := []regionIntelligibility{}
1443         // Convert the languageMatch entries in lists keyed by desired language.
1444         for _, m := range lm[0].LanguageMatch {
1445                 // Different versions of CLDR use different separators.
1446                 desired := strings.Replace(m.Desired, "-", "_", -1)
1447                 supported := strings.Replace(m.Supported, "-", "_", -1)
1448                 d := strings.Split(desired, "_")
1449                 s := strings.Split(supported, "_")
1450                 if len(d) != len(s) {
1451                         log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
1452                         continue
1453                 }
1454                 distance, _ := strconv.ParseInt(m.Distance, 10, 8)
1455                 switch len(d) {
1456                 case 2:
1457                         if desired == supported && desired == "*_*" {
1458                                 continue
1459                         }
1460                         // language-script pair.
1461                         matchScript = append(matchScript, scriptIntelligibility{
1462                                 wantLang:   uint16(b.langIndex(d[0])),
1463                                 haveLang:   uint16(b.langIndex(s[0])),
1464                                 wantScript: uint8(b.script.index(d[1])),
1465                                 haveScript: uint8(b.script.index(s[1])),
1466                                 distance:   uint8(distance),
1467                         })
1468                         if m.Oneway != "true" {
1469                                 matchScript = append(matchScript, scriptIntelligibility{
1470                                         wantLang:   uint16(b.langIndex(s[0])),
1471                                         haveLang:   uint16(b.langIndex(d[0])),
1472                                         wantScript: uint8(b.script.index(s[1])),
1473                                         haveScript: uint8(b.script.index(d[1])),
1474                                         distance:   uint8(distance),
1475                                 })
1476                         }
1477                 case 1:
1478                         if desired == supported && desired == "*" {
1479                                 continue
1480                         }
1481                         if distance == 1 {
1482                                 // nb == no is already handled by macro mapping. Check there
1483                                 // really is only this case.
1484                                 if d[0] != "no" || s[0] != "nb" {
1485                                         log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
1486                                 }
1487                                 continue
1488                         }
1489                         // TODO: consider dropping oneway field and just doubling the entry.
1490                         matchLang = append(matchLang, mutualIntelligibility{
1491                                 want:     uint16(b.langIndex(d[0])),
1492                                 have:     uint16(b.langIndex(s[0])),
1493                                 distance: uint8(distance),
1494                                 oneway:   m.Oneway == "true",
1495                         })
1496                 case 3:
1497                         if desired == supported && desired == "*_*_*" {
1498                                 continue
1499                         }
1500                         if desired != supported { // (Weird but correct.)
1501                                 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
1502                                 continue
1503                         }
1504                         ri := regionIntelligibility{
1505                                 lang:     b.langIndex(d[0]),
1506                                 distance: uint8(distance),
1507                         }
1508                         if d[1] != "*" {
1509                                 ri.script = uint8(b.script.index(d[1]))
1510                         }
1511                         switch {
1512                         case d[2] == "*":
1513                                 ri.group = 0x80 // not contained in anything
1514                         case strings.HasPrefix(d[2], "$!"):
1515                                 ri.group = 0x80
1516                                 d[2] = "$" + d[2][len("$!"):]
1517                                 fallthrough
1518                         case strings.HasPrefix(d[2], "$"):
1519                                 ri.group |= idToIndex[d[2]]
1520                         }
1521                         matchRegion = append(matchRegion, ri)
1522                 default:
1523                         log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
1524                 }
1525         }
1526         sort.SliceStable(matchLang, func(i, j int) bool {
1527                 return matchLang[i].distance < matchLang[j].distance
1528         })
1529         b.writeSlice("matchLang", matchLang)
1530
1531         sort.SliceStable(matchScript, func(i, j int) bool {
1532                 return matchScript[i].distance < matchScript[j].distance
1533         })
1534         b.writeSlice("matchScript", matchScript)
1535
1536         sort.SliceStable(matchRegion, func(i, j int) bool {
1537                 return matchRegion[i].distance < matchRegion[j].distance
1538         })
1539         b.writeSlice("matchRegion", matchRegion)
1540 }
1541
1542 func (b *builder) writeRegionInclusionData() {
1543         var (
1544                 // mm holds for each group the set of groups with a distance of 1.
1545                 mm = make(map[int][]index)
1546
1547                 // containment holds for each group the transitive closure of
1548                 // containment of other groups.
1549                 containment = make(map[index][]index)
1550         )
1551         for _, g := range b.supp.TerritoryContainment.Group {
1552                 // Skip UN and EURO zone as they are flattening the containment
1553                 // relationship.
1554                 if g.Type == "EZ" || g.Type == "UN" {
1555                         continue
1556                 }
1557                 group := b.region.index(g.Type)
1558                 groupIdx := b.groups[group]
1559                 for _, mem := range strings.Split(g.Contains, " ") {
1560                         r := b.region.index(mem)
1561                         mm[r] = append(mm[r], groupIdx)
1562                         if g, ok := b.groups[r]; ok {
1563                                 mm[group] = append(mm[group], g)
1564                                 containment[groupIdx] = append(containment[groupIdx], g)
1565                         }
1566                 }
1567         }
1568
1569         regionContainment := make([]uint64, len(b.groups))
1570         for _, g := range b.groups {
1571                 l := containment[g]
1572
1573                 // Compute the transitive closure of containment.
1574                 for i := 0; i < len(l); i++ {
1575                         l = append(l, containment[l[i]]...)
1576                 }
1577
1578                 // Compute the bitmask.
1579                 regionContainment[g] = 1 << g
1580                 for _, v := range l {
1581                         regionContainment[g] |= 1 << v
1582                 }
1583         }
1584         b.writeSlice("regionContainment", regionContainment)
1585
1586         regionInclusion := make([]uint8, len(b.region.s))
1587         bvs := make(map[uint64]index)
1588         // Make the first bitvector positions correspond with the groups.
1589         for r, i := range b.groups {
1590                 bv := uint64(1 << i)
1591                 for _, g := range mm[r] {
1592                         bv |= 1 << g
1593                 }
1594                 bvs[bv] = i
1595                 regionInclusion[r] = uint8(bvs[bv])
1596         }
1597         for r := 1; r < len(b.region.s); r++ {
1598                 if _, ok := b.groups[r]; !ok {
1599                         bv := uint64(0)
1600                         for _, g := range mm[r] {
1601                                 bv |= 1 << g
1602                         }
1603                         if bv == 0 {
1604                                 // Pick the world for unspecified regions.
1605                                 bv = 1 << b.groups[b.region.index("001")]
1606                         }
1607                         if _, ok := bvs[bv]; !ok {
1608                                 bvs[bv] = index(len(bvs))
1609                         }
1610                         regionInclusion[r] = uint8(bvs[bv])
1611                 }
1612         }
1613         b.writeSlice("regionInclusion", regionInclusion)
1614         regionInclusionBits := make([]uint64, len(bvs))
1615         for k, v := range bvs {
1616                 regionInclusionBits[v] = uint64(k)
1617         }
1618         // Add bit vectors for increasingly large distances until a fixed point is reached.
1619         regionInclusionNext := []uint8{}
1620         for i := 0; i < len(regionInclusionBits); i++ {
1621                 bits := regionInclusionBits[i]
1622                 next := bits
1623                 for i := uint(0); i < uint(len(b.groups)); i++ {
1624                         if bits&(1<<i) != 0 {
1625                                 next |= regionInclusionBits[i]
1626                         }
1627                 }
1628                 if _, ok := bvs[next]; !ok {
1629                         bvs[next] = index(len(bvs))
1630                         regionInclusionBits = append(regionInclusionBits, next)
1631                 }
1632                 regionInclusionNext = append(regionInclusionNext, uint8(bvs[next]))
1633         }
1634         b.writeSlice("regionInclusionBits", regionInclusionBits)
1635         b.writeSlice("regionInclusionNext", regionInclusionNext)
1636 }
1637
// parentRel describes one parent-locale override: the regions listed in
// fromRegion take the locale identified by lang, script and toRegion as their
// parent.
type parentRel struct {
	// lang is the language index of the parent locale.
	lang uint16

	// script is the script index of the parent locale when the parent names
	// a script explicitly; otherwise it is left at zero.
	script uint8

	// maxScript equals script when the parent names a script explicitly and
	// is the index of Latn otherwise.
	maxScript uint8

	// toRegion is the region index of the parent locale.
	toRegion uint16

	// fromRegion holds the region index taken from the last subtag of every
	// locale that inherits from the parent.
	fromRegion []uint16
}
1645
1646 func (b *builder) writeParents() {
1647         b.writeType(parentRel{})
1648
1649         parents := []parentRel{}
1650
1651         // Construct parent overrides.
1652         n := 0
1653         for _, p := range b.data.Supplemental().ParentLocales.ParentLocale {
1654                 // Skipping non-standard scripts to root is implemented using addTags.
1655                 if p.Parent == "root" {
1656                         continue
1657                 }
1658
1659                 sub := strings.Split(p.Parent, "_")
1660                 parent := parentRel{lang: b.langIndex(sub[0])}
1661                 if len(sub) == 2 {
1662                         // TODO: check that all undefined scripts are indeed Latn in these
1663                         // cases.
1664                         parent.maxScript = uint8(b.script.index("Latn"))
1665                         parent.toRegion = uint16(b.region.index(sub[1]))
1666                 } else {
1667                         parent.script = uint8(b.script.index(sub[1]))
1668                         parent.maxScript = parent.script
1669                         parent.toRegion = uint16(b.region.index(sub[2]))
1670                 }
1671                 for _, c := range strings.Split(p.Locales, " ") {
1672                         region := b.region.index(c[strings.LastIndex(c, "_")+1:])
1673                         parent.fromRegion = append(parent.fromRegion, uint16(region))
1674                 }
1675                 parents = append(parents, parent)
1676                 n += len(parent.fromRegion)
1677         }
1678         b.writeSliceAddSize("parents", n*2, parents)
1679 }
1680
1681 func main() {
1682         gen.Init()
1683
1684         gen.Repackage("gen_common.go", "common.go", "language")
1685
1686         w := gen.NewCodeWriter()
1687         defer w.WriteGoFile("tables.go", "language")
1688
1689         fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`)
1690
1691         b := newBuilder(w)
1692         gen.WriteCLDRVersion(w)
1693
1694         b.parseIndices()
1695         b.writeType(fromTo{})
1696         b.writeLanguage()
1697         b.writeScript()
1698         b.writeRegion()
1699         b.writeVariant()
1700         // TODO: b.writeLocale()
1701         b.computeRegionGroups()
1702         b.writeLikelyData()
1703         b.writeMatchData()
1704         b.writeRegionInclusionData()
1705         b.writeParents()
1706 }