vendor/golang.org/x/text/cases/gen.go

   1 // Copyright 2014 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // +build ignore
   6
   7 // This program generates the trie for casing operations. The Unicode casing
   8 // algorithm requires the lookup of various properties and mappings for each
   9 // rune. The table generated by this generator combines several of the most
  10 // frequently used of these into a single trie so that they can be accessed
  11 // with a single lookup.
  12 package main
  13
  14 import (
  15         "bytes"
  16         "fmt"
  17         "io"
  18         "io/ioutil"
  19         "log"
  20         "reflect"
  21         "strconv"
  22         "strings"
  23         "unicode"
  24
  25         "golang.org/x/text/internal/gen"
  26         "golang.org/x/text/internal/triegen"
  27         "golang.org/x/text/internal/ucd"
  28         "golang.org/x/text/unicode/norm"
  29 )
  30
  31 func main() {
  32         gen.Init()
  33         genTables()
  34         genTablesTest()
  35         gen.Repackage("gen_trieval.go", "trieval.go", "cases")
  36 }
  37
  38 // runeInfo contains all information for a rune that we care about for casing
  39 // operations.
  40 type runeInfo struct {
  41         Rune rune
  42
  43         entry info // trie value for this rune.
  44
  45         CaseMode info
  46
  47         // Simple case mappings.
  48         Simple [1 + maxCaseMode][]rune
  49
  50         // Special casing
  51         HasSpecial  bool
  52         Conditional bool
  53         Special     [1 + maxCaseMode][]rune
  54
  55         // Folding
  56         FoldSimple  rune
  57         FoldSpecial rune
  58         FoldFull    []rune
  59
  60         // TODO: FC_NFKC, or equivalent data.
  61
  62         // Properties
  63         SoftDotted     bool
  64         CaseIgnorable  bool
  65         Cased          bool
  66         DecomposeGreek bool
  67         BreakType      string
  68         BreakCat       breakCategory
  69
  70         // We care mostly about 0, Above, and IotaSubscript.
  71         CCC byte
  72 }
  73
  74 type breakCategory int
  75
  76 const (
  77         breakBreak breakCategory = iota
  78         breakLetter
  79         breakMid
  80 )
  81
  82 // mapping returns the case mapping for the given case type.
  83 func (r *runeInfo) mapping(c info) string {
  84         if r.HasSpecial {
  85                 return string(r.Special[c])
  86         }
  87         if len(r.Simple[c]) != 0 {
  88                 return string(r.Simple[c])
  89         }
  90         return string(r.Rune)
  91 }
  92
  93 func parse(file string, f func(p *ucd.Parser)) {
  94         ucd.Parse(gen.OpenUCDFile(file), f)
  95 }
  96
  97 func parseUCD() []runeInfo {
  98         chars := make([]runeInfo, unicode.MaxRune)
  99
 100         get := func(r rune) *runeInfo {
 101                 c := &chars[r]
 102                 c.Rune = r
 103                 return c
 104         }
 105
 106         parse("UnicodeData.txt", func(p *ucd.Parser) {
 107                 ri := get(p.Rune(0))
 108                 ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
 109                 ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
 110                 ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
 111                 ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
 112                 if p.String(ucd.GeneralCategory) == "Lt" {
 113                         ri.CaseMode = cTitle
 114                 }
 115         })
 116
 117         // <code>; <property>
 118         parse("PropList.txt", func(p *ucd.Parser) {
 119                 if p.String(1) == "Soft_Dotted" {
 120                         chars[p.Rune(0)].SoftDotted = true
 121                 }
 122         })
 123
 124         // <code>; <word break type>
 125         parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
 126                 ri := get(p.Rune(0))
 127                 switch p.String(1) {
 128                 case "Case_Ignorable":
 129                         ri.CaseIgnorable = true
 130                 case "Cased":
 131                         ri.Cased = true
 132                 case "Lowercase":
 133                         ri.CaseMode = cLower
 134                 case "Uppercase":
 135                         ri.CaseMode = cUpper
 136                 }
 137         })
 138
 139         // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
 140         parse("SpecialCasing.txt", func(p *ucd.Parser) {
 141                 // We drop all conditional special casing and deal with them manually in
 142                 // the language-specific case mappers. Rune 0x03A3 is the only one with
 143                 // a conditional formatting that is not language-specific. However,
 144                 // dealing with this letter is tricky, especially in a streaming
 145                 // context, so we deal with it in the Caser for Greek specifically.
 146                 ri := get(p.Rune(0))
 147                 if p.String(4) == "" {
 148                         ri.HasSpecial = true
 149                         ri.Special[cLower] = p.Runes(1)
 150                         ri.Special[cTitle] = p.Runes(2)
 151                         ri.Special[cUpper] = p.Runes(3)
 152                 } else {
 153                         ri.Conditional = true
 154                 }
 155         })
 156
 157         // TODO: Use text breaking according to UAX #29.
 158         // <code>; <word break type>
 159         parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
 160                 ri := get(p.Rune(0))
 161                 ri.BreakType = p.String(1)
 162
 163                 // We collapse the word breaking properties onto the categories we need.
 164                 switch p.String(1) { // TODO: officially we need to canonicalize.
 165                 case "MidLetter", "MidNumLet", "Single_Quote":
 166                         ri.BreakCat = breakMid
 167                         if !ri.CaseIgnorable {
 168                                 // finalSigma relies on the fact that all breakMid runes are
 169                                 // also a Case_Ignorable. Revisit this code when this changes.
 170                                 log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri)
 171                         }
 172                 case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ":
 173                         ri.BreakCat = breakLetter
 174                 }
 175         })
 176
 177         // <code>; <type>; <mapping>
 178         parse("CaseFolding.txt", func(p *ucd.Parser) {
 179                 ri := get(p.Rune(0))
 180                 switch p.String(1) {
 181                 case "C":
 182                         ri.FoldSimple = p.Rune(2)
 183                         ri.FoldFull = p.Runes(2)
 184                 case "S":
 185                         ri.FoldSimple = p.Rune(2)
 186                 case "T":
 187                         ri.FoldSpecial = p.Rune(2)
 188                 case "F":
 189                         ri.FoldFull = p.Runes(2)
 190                 default:
 191                         log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
 192                 }
 193         })
 194
 195         return chars
 196 }
 197
 198 func genTables() {
 199         chars := parseUCD()
 200         verifyProperties(chars)
 201
 202         t := triegen.NewTrie("case")
 203         for i := range chars {
 204                 c := &chars[i]
 205                 makeEntry(c)
 206                 t.Insert(rune(i), uint64(c.entry))
 207         }
 208
 209         w := gen.NewCodeWriter()
 210         defer w.WriteGoFile("tables.go", "cases")
 211
 212         gen.WriteUnicodeVersion(w)
 213
 214         // TODO: write CLDR version after adding a mechanism to detect that the
 215         // tables on which the manually created locale-sensitive casing code is
 216         // based hasn't changed.
 217
 218         w.WriteVar("xorData", string(xorData))
 219         w.WriteVar("exceptions", string(exceptionData))
 220
 221         sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
 222         if err != nil {
 223                 log.Fatal(err)
 224         }
 225         w.Size += sz
 226 }
 227
 228 func makeEntry(ri *runeInfo) {
 229         if ri.CaseIgnorable {
 230                 if ri.Cased {
 231                         ri.entry = cIgnorableCased
 232                 } else {
 233                         ri.entry = cIgnorableUncased
 234                 }
 235         } else {
 236                 ri.entry = ri.CaseMode
 237         }
 238
 239         // TODO: handle soft-dotted.
 240
 241         ccc := cccOther
 242         switch ri.CCC {
 243         case 0: // Not_Reordered
 244                 ccc = cccZero
 245         case above: // Above
 246                 ccc = cccAbove
 247         }
 248         switch ri.BreakCat {
 249         case breakBreak:
 250                 ccc = cccBreak
 251         case breakMid:
 252                 ri.entry |= isMidBit
 253         }
 254
 255         ri.entry |= ccc
 256
 257         if ri.CaseMode == cUncased {
 258                 return
 259         }
 260
 261         // Need to do something special.
 262         if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
 263                 makeException(ri)
 264                 return
 265         }
 266         if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
 267                 makeException(ri)
 268                 return
 269         }
 270
 271         // Rune is either lowercase or uppercase.
 272
 273         orig := string(ri.Rune)
 274         mapped := ""
 275         if ri.CaseMode == cUpper {
 276                 mapped = ri.mapping(cLower)
 277         } else {
 278                 mapped = ri.mapping(cUpper)
 279         }
 280
 281         if len(orig) != len(mapped) {
 282                 makeException(ri)
 283                 return
 284         }
 285
 286         if string(ri.FoldFull) == ri.mapping(cUpper) {
 287                 ri.entry |= inverseFoldBit
 288         }
 289
 290         n := len(orig)
 291
 292         // Create per-byte XOR mask.
 293         var b []byte
 294         for i := 0; i < n; i++ {
 295                 b = append(b, orig[i]^mapped[i])
 296         }
 297
 298         // Remove leading 0 bytes, but keep at least one byte.
 299         for ; len(b) > 1 && b[0] == 0; b = b[1:] {
 300         }
 301
 302         if len(b) == 1 && b[0]&0xc0 == 0 {
 303                 ri.entry |= info(b[0]) << xorShift
 304                 return
 305         }
 306
 307         key := string(b)
 308         x, ok := xorCache[key]
 309         if !ok {
 310                 xorData = append(xorData, 0) // for detecting start of sequence
 311                 xorData = append(xorData, b...)
 312
 313                 x = len(xorData) - 1
 314                 xorCache[key] = x
 315         }
 316         ri.entry |= info(x<<xorShift) | xorIndexBit
 317 }
 318
 319 var xorCache = map[string]int{}
 320
 321 // xorData contains byte-wise XOR data for the least significant bytes of a
 322 // UTF-8 encoded rune. An index points to the last byte. The sequence starts
 323 // with a zero terminator.
 324 var xorData = []byte{}
 325
 326 // See the comments in gen_trieval.go re "the exceptions slice".
 327 var exceptionData = []byte{0}
 328
 329 // makeException encodes case mappings that cannot be expressed in a simple
 330 // XOR diff.
 331 func makeException(ri *runeInfo) {
 332         ccc := ri.entry & cccMask
 333         // Set exception bit and retain case type.
 334         ri.entry &= 0x0007
 335         ri.entry |= exceptionBit
 336
 337         if len(exceptionData) >= 1<<numExceptionBits {
 338                 log.Fatalf("%U:exceptionData too large %x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
 339         }
 340
 341         // Set the offset in the exceptionData array.
 342         ri.entry |= info(len(exceptionData) << exceptionShift)
 343
 344         orig := string(ri.Rune)
 345         tc := ri.mapping(cTitle)
 346         uc := ri.mapping(cUpper)
 347         lc := ri.mapping(cLower)
 348         ff := string(ri.FoldFull)
 349
 350         // addString sets the length of a string and adds it to the expansions array.
 351         addString := func(s string, b *byte) {
 352                 if len(s) == 0 {
 353                         // Zero-length mappings exist, but only for conditional casing,
 354                         // which we are representing outside of this table.
 355                         log.Fatalf("%U: has zero-length mapping.", ri.Rune)
 356                 }
 357                 *b <<= 3
 358                 if s != orig {
 359                         n := len(s)
 360                         if n > 7 {
 361                                 log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
 362                         }
 363                         *b |= byte(n)
 364                         exceptionData = append(exceptionData, s...)
 365                 }
 366         }
 367
 368         // byte 0:
 369         exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))
 370
 371         // byte 1:
 372         p := len(exceptionData)
 373         exceptionData = append(exceptionData, 0)
 374
 375         if len(ff) > 7 { // May be zero-length.
 376                 log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
 377         }
 378         exceptionData = append(exceptionData, ff...)
 379         ct := ri.CaseMode
 380         if ct != cLower {
 381                 addString(lc, &exceptionData[p])
 382         }
 383         if ct != cUpper {
 384                 addString(uc, &exceptionData[p])
 385         }
 386         if ct != cTitle {
 387                 // If title is the same as upper, we set it to the original string so
 388                 // that it will be marked as not present. This implies title case is
 389                 // the same as upper case.
 390                 if tc == uc {
 391                         tc = orig
 392                 }
 393                 addString(tc, &exceptionData[p])
 394         }
 395 }
 396
 397 // sparseCompacter is a trie value block Compacter. There are many cases where
 398 // successive runes alternate between lower- and upper-case. This Compacter
 399 // exploits this by adding a special case type where the case value is obtained
 400 // from or-ing it with the least-significant bit of the rune, creating large
 401 // ranges of equal case values that compress well.
 402 type sparseCompacter struct {
 403         sparseBlocks  [][]uint16
 404         sparseOffsets []uint16
 405         sparseCount   int
 406 }
 407
 408 // makeSparse returns the number of elements that compact block would contain
 409 // as well as the modified values.
 410 func makeSparse(vals []uint64) ([]uint16, int) {
 411         // Copy the values.
 412         values := make([]uint16, len(vals))
 413         for i, v := range vals {
 414                 values[i] = uint16(v)
 415         }
 416
 417         alt := func(i int, v uint16) uint16 {
 418                 if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
 419                         // Convert cLower or cUpper to cXORCase value, which has the form 11x.
 420                         xor := v
 421                         xor &^= 1
 422                         xor |= uint16(i&1) ^ (v & 1)
 423                         xor |= 0x4
 424                         return xor
 425                 }
 426                 return v
 427         }
 428
 429         var count int
 430         var previous uint16
 431         for i, v := range values {
 432                 if v != 0 {
 433                         // Try if the unmodified value is equal to the previous.
 434                         if v == previous {
 435                                 continue
 436                         }
 437
 438                         // Try if the xor-ed value is equal to the previous value.
 439                         a := alt(i, v)
 440                         if a == previous {
 441                                 values[i] = a
 442                                 continue
 443                         }
 444
 445                         // This is a new value.
 446                         count++
 447
 448                         // Use the xor-ed value if it will be identical to the next value.
 449                         if p := i + 1; p < len(values) && alt(p, values[p]) == a {
 450                                 values[i] = a
 451                                 v = a
 452                         }
 453                 }
 454                 previous = v
 455         }
 456         return values, count
 457 }
 458
 459 func (s *sparseCompacter) Size(v []uint64) (int, bool) {
 460         _, n := makeSparse(v)
 461
 462         // We limit using this method to having 16 entries.
 463         if n > 16 {
 464                 return 0, false
 465         }
 466
 467         return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
 468 }
 469
 470 func (s *sparseCompacter) Store(v []uint64) uint32 {
 471         h := uint32(len(s.sparseOffsets))
 472         values, sz := makeSparse(v)
 473         s.sparseBlocks = append(s.sparseBlocks, values)
 474         s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
 475         s.sparseCount += sz
 476         return h
 477 }
 478
 479 func (s *sparseCompacter) Handler() string {
 480         // The sparse global variable and its lookup method is defined in gen_trieval.go.
 481         return "sparse.lookup"
 482 }
 483
 484 func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
 485         p := func(format string, args ...interface{}) {
 486                 _, err := fmt.Fprintf(w, format, args...)
 487                 if retErr == nil && err != nil {
 488                         retErr = err
 489                 }
 490         }
 491
 492         ls := len(s.sparseBlocks)
 493         if ls == len(s.sparseOffsets) {
 494                 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
 495         }
 496         p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
 497         p("var sparseOffsets = %#v\n\n", s.sparseOffsets)
 498
 499         ns := s.sparseCount
 500         p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
 501         p("var sparseValues = [%d]valueRange {", ns)
 502         for i, values := range s.sparseBlocks {
 503                 p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
 504                 var v uint16
 505                 for i, nv := range values {
 506                         if nv != v {
 507                                 if v != 0 {
 508                                         p(",hi:%#02x},", 0x80+i-1)
 509                                 }
 510                                 if nv != 0 {
 511                                         p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
 512                                 }
 513                         }
 514                         v = nv
 515                 }
 516                 if v != 0 {
 517                         p(",hi:%#02x},", 0x80+len(values)-1)
 518                 }
 519         }
 520         p("\n}\n\n")
 521         return
 522 }
 523
 524 // verifyProperties that properties of the runes that are relied upon in the
 525 // implementation. Each property is marked with an identifier that is referred
 526 // to in the places where it is used.
 527 func verifyProperties(chars []runeInfo) {
 528         for i, c := range chars {
 529                 r := rune(i)
 530
 531                 // Rune properties.
 532
 533                 // A.1: modifier never changes on lowercase. [ltLower]
 534                 if c.CCC > 0 && unicode.ToLower(r) != r {
 535                         log.Fatalf("%U: non-starter changes when lowercased", r)
 536                 }
 537
 538                 // A.2: properties of decompositions starting with I or J. [ltLower]
 539                 d := norm.NFD.PropertiesString(string(r)).Decomposition()
 540                 if len(d) > 0 {
 541                         if d[0] == 'I' || d[0] == 'J' {
 542                                 // A.2.1: we expect at least an ASCII character and a modifier.
 543                                 if len(d) < 3 {
 544                                         log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
 545                                 }
 546
 547                                 // All subsequent runes are modifiers and all have the same CCC.
 548                                 runes := []rune(string(d[1:]))
 549                                 ccc := chars[runes[0]].CCC
 550
 551                                 for _, mr := range runes[1:] {
 552                                         mc := chars[mr]
 553
 554                                         // A.2.2: all modifiers have a CCC of Above or less.
 555                                         if ccc == 0 || ccc > above {
 556                                                 log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
 557                                         }
 558
 559                                         // A.2.3: a sequence of modifiers all have the same CCC.
 560                                         if mc.CCC != ccc {
 561                                                 log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
 562                                         }
 563
 564                                         // A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above.
 565                                         if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
 566                                                 log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
 567                                         }
 568
 569                                         if i += len(string(mr)); i >= len(d) {
 570                                                 break
 571                                         }
 572                                 }
 573                         }
 574                 }
 575
 576                 // A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper]
 577                 if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
 578                         log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
 579                 }
 580
 581                 // A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper]
 582                 if c.CCC == iotaSubscript && r != 0x0345 {
 583                         log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
 584                 }
 585
 586                 // A.5: soft-dotted runes do not have exceptions.
 587                 if c.SoftDotted && c.entry&exceptionBit != 0 {
 588                         log.Fatalf("%U: soft-dotted has exception", r)
 589                 }
 590
 591                 // A.6: Greek decomposition. [elUpper]
 592                 if unicode.Is(unicode.Greek, r) {
 593                         if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
 594                                 runes := []rune(string(b))
 595                                 // A.6.1: If a Greek rune decomposes and the first rune of the
 596                                 // decomposition is greater than U+00FF, the rune is always
 597                                 // great and not a modifier.
 598                                 if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
 599                                         log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f)
 600                                 }
 601                                 // A.6.2: Any follow-up rune in a Greek decomposition is a
 602                                 // modifier of which the first should be gobbled in
 603                                 // decomposition.
 604                                 for _, m := range runes[1:] {
 605                                         switch m {
 606                                         case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
 607                                         default:
 608                                                 log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m)
 609                                         }
 610                                 }
 611                         }
 612                 }
 613
 614                 // Breaking properties.
 615
 616                 // B.1: all runes with CCC > 0 are of break type Extend.
 617                 if c.CCC > 0 && c.BreakType != "Extend" {
 618                         log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
 619                 }
 620
 621                 // B.2: all cased runes with c.CCC == 0 are of break type ALetter.
 622                 if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
 623                         log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
 624                 }
 625
 626                 // B.3: letter category.
 627                 if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
 628                         if c.BreakCat != breakLetter {
 629                                 log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
 630                         }
 631                 }
 632         }
 633 }
 634
 635 func genTablesTest() {
 636         w := &bytes.Buffer{}
 637
 638         fmt.Fprintln(w, "var (")
 639         printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)
 640
 641         // We discard the output as we know we have perfect functions. We run them
 642         // just to verify the properties are correct.
 643         n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
 644         n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
 645         n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
 646         if n > 0 {
 647                 log.Fatalf("One of the discarded properties does not have a perfect filter.")
 648         }
 649
 650         // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
 651         fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
 652         parse("SpecialCasing.txt", func(p *ucd.Parser) {
 653                 // Skip conditional entries.
 654                 if p.String(4) != "" {
 655                         return
 656                 }
 657                 r := p.Rune(0)
 658                 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
 659                         r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
 660         })
 661         fmt.Fprint(w, "\t}\n\n")
 662
 663         // <code>; <type>; <runes>
 664         table := map[rune]struct{ simple, full, special string }{}
 665         parse("CaseFolding.txt", func(p *ucd.Parser) {
 666                 r := p.Rune(0)
 667                 t := p.String(1)
 668                 v := string(p.Runes(2))
 669                 if t != "T" && v == string(unicode.ToLower(r)) {
 670                         return
 671                 }
 672                 x := table[r]
 673                 switch t {
 674                 case "C":
 675                         x.full = v
 676                         x.simple = v
 677                 case "S":
 678                         x.simple = v
 679                 case "F":
 680                         x.full = v
 681                 case "T":
 682                         x.special = v
 683                 }
 684                 table[r] = x
 685         })
 686         fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
 687         for r := rune(0); r < 0x10FFFF; r++ {
 688                 x, ok := table[r]
 689                 if !ok {
 690                         continue
 691                 }
 692                 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
 693         }
 694         fmt.Fprint(w, "\t}\n\n")
 695
 696         // Break property
 697         notBreak := map[rune]bool{}
 698         parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
 699                 switch p.String(1) {
 700                 case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
 701                         "ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ":
 702                         notBreak[p.Rune(0)] = true
 703                 }
 704         })
 705
 706         fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
 707         inBreak := false
 708         for r := rune(0); r <= lastRuneForTesting; r++ {
 709                 if isBreak := !notBreak[r]; isBreak != inBreak {
 710                         if isBreak {
 711                                 fmt.Fprintf(w, "\t\t{0x%x, ", r)
 712                         } else {
 713                                 fmt.Fprintf(w, "0x%x},\n", r-1)
 714                         }
 715                         inBreak = isBreak
 716                 }
 717         }
 718         if inBreak {
 719                 fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
 720         }
 721         fmt.Fprint(w, "\t}\n\n")
 722
 723         // Word break test
 724         // Filter out all samples that do not contain cased characters.
 725         cased := map[rune]bool{}
 726         parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
 727                 if p.String(1) == "Cased" {
 728                         cased[p.Rune(0)] = true
 729                 }
 730         })
 731
 732         fmt.Fprintln(w, "\tbreakTest = []string{")
 733         parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
 734                 c := strings.Split(p.String(0), " ")
 735
 736                 const sep = '|'
 737                 numCased := 0
 738                 test := ""
 739                 for ; len(c) >= 2; c = c[2:] {
 740                         if c[0] == "÷" && test != "" {
 741                                 test += string(sep)
 742                         }
 743                         i, err := strconv.ParseUint(c[1], 16, 32)
 744                         r := rune(i)
 745                         if err != nil {
 746                                 log.Fatalf("Invalid rune %q.", c[1])
 747                         }
 748                         if r == sep {
 749                                 log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
 750                         }
 751                         if cased[r] {
 752                                 numCased++
 753                         }
 754                         test += string(r)
 755                 }
 756                 if numCased > 1 {
 757                         fmt.Fprintf(w, "\t\t%q,\n", test)
 758                 }
 759         })
 760         fmt.Fprintln(w, "\t}")
 761
 762         fmt.Fprintln(w, ")")
 763
 764         gen.WriteGoFile("tables_test.go", "cases", w.Bytes())
 765 }
 766
 767 // These functions are just used for verification that their definition have not
 768 // changed in the Unicode Standard.
 769
 770 func verifyCased(r rune) bool {
 771         return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
 772 }
 773
 774 func verifyLower(r rune) bool {
 775         return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r)
 776 }
 777
 778 func verifyUpper(r rune) bool {
 779         return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r)
 780 }
 781
 782 // verifyIgnore is an approximation of the Case_Ignorable property using the
 783 // core unicode package. It is used to reduce the size of the test data.
 784 func verifyIgnore(r rune) bool {
 785         props := []*unicode.RangeTable{
 786                 unicode.Mn,
 787                 unicode.Me,
 788                 unicode.Cf,
 789                 unicode.Lm,
 790                 unicode.Sk,
 791         }
 792         for _, p := range props {
 793                 if unicode.Is(p, r) {
 794                         return true
 795                 }
 796         }
 797         return false
 798 }
 799
 800 // printProperties prints tables of rune properties from the given UCD file.
 801 // A filter func f can be given to exclude certain values. A rune r will have
 802 // the indicated property if it is in the generated table or if f(r).
 803 func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
 804         verify := map[rune]bool{}
 805         n := 0
 806         varNameParts := strings.Split(property, "_")
 807         varNameParts[0] = strings.ToLower(varNameParts[0])
 808         fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
 809         parse(file, func(p *ucd.Parser) {
 810                 if p.String(1) == property {
 811                         r := p.Rune(0)
 812                         verify[r] = true
 813                         if !f(r) {
 814                                 n++
 815                                 fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
 816                         }
 817                 }
 818         })
 819         fmt.Fprint(w, "\t}\n\n")
 820
 821         // Verify that f is correct, that is, it represents a subset of the property.
 822         for r := rune(0); r <= lastRuneForTesting; r++ {
 823                 if !verify[r] && f(r) {
 824                         log.Fatalf("Incorrect filter func for property %q.", property)
 825                 }
 826         }
 827         return n
 828 }
 829
 830 // The newCaseTrie, sparseValues and sparseOffsets definitions below are
 831 // placeholders referred to by gen_trieval.go. The real definitions are
 832 // generated by this program and written to tables.go.
 833
 834 func newCaseTrie(int) int { return 0 }
 835
 836 var (
 837         sparseValues  [0]valueRange
 838         sparseOffsets [0]uint16
 839 )