1 // Copyright 2014 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
7 // This program generates the trie for casing operations. The Unicode casing
8 // algorithm requires the lookup of various properties and mappings for each
9 // rune. The table generated by this generator combines several of the most
10 // frequently used of these into a single trie so that they can be accessed
11 // with a single lookup.
25 "golang.org/x/text/internal/gen"
26 "golang.org/x/text/internal/triegen"
27 "golang.org/x/text/internal/ucd"
28 "golang.org/x/text/unicode/norm"
35 gen.Repackage("gen_trieval.go", "trieval.go", "cases")
38 // runeInfo contains all information for a rune that we care about for casing
40 type runeInfo struct {
43 entry info // trie value for this rune.
47 // Simple case mappings.
48 Simple [1 + maxCaseMode][]rune
53 Special [1 + maxCaseMode][]rune
60 // TODO: FC_NFKC, or equivalent data.
68 BreakCat breakCategory
70 // We care mostly about 0, Above, and IotaSubscript.
74 type breakCategory int
77 breakBreak breakCategory = iota
82 // mapping returns the case mapping for the given case type.
83 func (r *runeInfo) mapping(c info) string {
85 return string(r.Special[c])
87 if len(r.Simple[c]) != 0 {
88 return string(r.Simple[c])
93 func parse(file string, f func(p *ucd.Parser)) {
94 ucd.Parse(gen.OpenUCDFile(file), f)
97 func parseUCD() []runeInfo {
98 chars := make([]runeInfo, unicode.MaxRune)
100 get := func(r rune) *runeInfo {
106 parse("UnicodeData.txt", func(p *ucd.Parser) {
108 ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
109 ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
110 ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
111 ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
112 if p.String(ucd.GeneralCategory) == "Lt" {
117 // <code>; <property>
118 parse("PropList.txt", func(p *ucd.Parser) {
119 if p.String(1) == "Soft_Dotted" {
120 chars[p.Rune(0)].SoftDotted = true
124 // <code>; <word break type>
125 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
128 case "Case_Ignorable":
129 ri.CaseIgnorable = true
139 // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
140 parse("SpecialCasing.txt", func(p *ucd.Parser) {
141 // We drop all conditional special casing and deal with them manually in
142 // the language-specific case mappers. Rune 0x03A3 is the only one with
143 // a conditional formatting that is not language-specific. However,
144 // dealing with this letter is tricky, especially in a streaming
145 // context, so we deal with it in the Caser for Greek specifically.
147 if p.String(4) == "" {
149 ri.Special[cLower] = p.Runes(1)
150 ri.Special[cTitle] = p.Runes(2)
151 ri.Special[cUpper] = p.Runes(3)
153 ri.Conditional = true
157 // TODO: Use text breaking according to UAX #29.
158 // <code>; <word break type>
159 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
161 ri.BreakType = p.String(1)
163 // We collapse the word breaking properties onto the categories we need.
164 switch p.String(1) { // TODO: officially we need to canonicalize.
165 case "MidLetter", "MidNumLet", "Single_Quote":
166 ri.BreakCat = breakMid
167 if !ri.CaseIgnorable {
168 // finalSigma relies on the fact that all breakMid runes are
169 // also a Case_Ignorable. Revisit this code when this changes.
170 log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri)
172 case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ":
173 ri.BreakCat = breakLetter
177 // <code>; <type>; <mapping>
178 parse("CaseFolding.txt", func(p *ucd.Parser) {
182 ri.FoldSimple = p.Rune(2)
183 ri.FoldFull = p.Runes(2)
185 ri.FoldSimple = p.Rune(2)
187 ri.FoldSpecial = p.Rune(2)
189 ri.FoldFull = p.Runes(2)
191 log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
200 verifyProperties(chars)
202 t := triegen.NewTrie("case")
203 for i := range chars {
206 t.Insert(rune(i), uint64(c.entry))
209 w := gen.NewCodeWriter()
210 defer w.WriteGoFile("tables.go", "cases")
212 gen.WriteUnicodeVersion(w)
214 // TODO: write CLDR version after adding a mechanism to detect that the
215 // tables on which the manually created locale-sensitive casing code is
216 // based hasn't changed.
218 w.WriteVar("xorData", string(xorData))
219 w.WriteVar("exceptions", string(exceptionData))
221 sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
228 func makeEntry(ri *runeInfo) {
229 if ri.CaseIgnorable {
231 ri.entry = cIgnorableCased
233 ri.entry = cIgnorableUncased
236 ri.entry = ri.CaseMode
239 // TODO: handle soft-dotted.
243 case 0: // Not_Reordered
257 if ri.CaseMode == cUncased {
261 // Need to do something special.
262 if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
266 if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
271 // Rune is either lowercase or uppercase.
273 orig := string(ri.Rune)
275 if ri.CaseMode == cUpper {
276 mapped = ri.mapping(cLower)
278 mapped = ri.mapping(cUpper)
281 if len(orig) != len(mapped) {
286 if string(ri.FoldFull) == ri.mapping(cUpper) {
287 ri.entry |= inverseFoldBit
292 // Create per-byte XOR mask.
294 for i := 0; i < n; i++ {
295 b = append(b, orig[i]^mapped[i])
298 // Remove leading 0 bytes, but keep at least one byte.
299 for ; len(b) > 1 && b[0] == 0; b = b[1:] {
302 if len(b) == 1 && b[0]&0xc0 == 0 {
303 ri.entry |= info(b[0]) << xorShift
308 x, ok := xorCache[key]
310 xorData = append(xorData, 0) // for detecting start of sequence
311 xorData = append(xorData, b...)
316 ri.entry |= info(x<<xorShift) | xorIndexBit
319 var xorCache = map[string]int{}
321 // xorData contains byte-wise XOR data for the least significant bytes of a
322 // UTF-8 encoded rune. An index points to the last byte. The sequence starts
323 // with a zero terminator.
324 var xorData = []byte{}
326 // See the comments in gen_trieval.go re "the exceptions slice".
327 var exceptionData = []byte{0}
329 // makeException encodes case mappings that cannot be expressed in a simple
331 func makeException(ri *runeInfo) {
332 ccc := ri.entry & cccMask
333 // Set exception bit and retain case type.
335 ri.entry |= exceptionBit
337 if len(exceptionData) >= 1<<numExceptionBits {
338 log.Fatalf("%U:exceptionData too large %x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
341 // Set the offset in the exceptionData array.
342 ri.entry |= info(len(exceptionData) << exceptionShift)
344 orig := string(ri.Rune)
345 tc := ri.mapping(cTitle)
346 uc := ri.mapping(cUpper)
347 lc := ri.mapping(cLower)
348 ff := string(ri.FoldFull)
350 // addString sets the length of a string and adds it to the expansions array.
351 addString := func(s string, b *byte) {
353 // Zero-length mappings exist, but only for conditional casing,
354 // which we are representing outside of this table.
355 log.Fatalf("%U: has zero-length mapping.", ri.Rune)
361 log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
364 exceptionData = append(exceptionData, s...)
369 exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))
372 p := len(exceptionData)
373 exceptionData = append(exceptionData, 0)
375 if len(ff) > 7 { // May be zero-length.
376 log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
378 exceptionData = append(exceptionData, ff...)
381 addString(lc, &exceptionData[p])
384 addString(uc, &exceptionData[p])
387 // If title is the same as upper, we set it to the original string so
388 // that it will be marked as not present. This implies title case is
389 // the same as upper case.
393 addString(tc, &exceptionData[p])
397 // sparseCompacter is a trie value block Compacter. There are many cases where
398 // successive runes alternate between lower- and upper-case. This Compacter
399 // exploits this by adding a special case type where the case value is obtained
400 // from or-ing it with the least-significant bit of the rune, creating large
401 // ranges of equal case values that compress well.
402 type sparseCompacter struct {
403 sparseBlocks [][]uint16
404 sparseOffsets []uint16
408 // makeSparse returns the number of elements that compact block would contain
409 // as well as the modified values.
410 func makeSparse(vals []uint64) ([]uint16, int) {
412 values := make([]uint16, len(vals))
413 for i, v := range vals {
414 values[i] = uint16(v)
417 alt := func(i int, v uint16) uint16 {
418 if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
419 // Convert cLower or cUpper to cXORCase value, which has the form 11x.
422 xor |= uint16(i&1) ^ (v & 1)
431 for i, v := range values {
433 // Try if the unmodified value is equal to the previous.
438 // Try if the xor-ed value is equal to the previous value.
445 // This is a new value.
448 // Use the xor-ed value if it will be identical to the next value.
449 if p := i + 1; p < len(values) && alt(p, values[p]) == a {
459 func (s *sparseCompacter) Size(v []uint64) (int, bool) {
460 _, n := makeSparse(v)
462 // We limit using this method to having 16 entries.
467 return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
470 func (s *sparseCompacter) Store(v []uint64) uint32 {
471 h := uint32(len(s.sparseOffsets))
472 values, sz := makeSparse(v)
473 s.sparseBlocks = append(s.sparseBlocks, values)
474 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
479 func (s *sparseCompacter) Handler() string {
480 // The sparse global variable and its lookup method is defined in gen_trieval.go.
481 return "sparse.lookup"
484 func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
485 p := func(format string, args ...interface{}) {
486 _, err := fmt.Fprintf(w, format, args...)
487 if retErr == nil && err != nil {
492 ls := len(s.sparseBlocks)
493 if ls == len(s.sparseOffsets) {
494 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
496 p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
497 p("var sparseOffsets = %#v\n\n", s.sparseOffsets)
500 p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
501 p("var sparseValues = [%d]valueRange {", ns)
502 for i, values := range s.sparseBlocks {
503 p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
505 for i, nv := range values {
508 p(",hi:%#02x},", 0x80+i-1)
511 p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
517 p(",hi:%#02x},", 0x80+len(values)-1)
524 // verifyProperties that properties of the runes that are relied upon in the
525 // implementation. Each property is marked with an identifier that is referred
526 // to in the places where it is used.
527 func verifyProperties(chars []runeInfo) {
528 for i, c := range chars {
533 // A.1: modifier never changes on lowercase. [ltLower]
534 if c.CCC > 0 && unicode.ToLower(r) != r {
535 log.Fatalf("%U: non-starter changes when lowercased", r)
538 // A.2: properties of decompositions starting with I or J. [ltLower]
539 d := norm.NFD.PropertiesString(string(r)).Decomposition()
541 if d[0] == 'I' || d[0] == 'J' {
542 // A.2.1: we expect at least an ASCII character and a modifier.
544 log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
547 // All subsequent runes are modifiers and all have the same CCC.
548 runes := []rune(string(d[1:]))
549 ccc := chars[runes[0]].CCC
551 for _, mr := range runes[1:] {
554 // A.2.2: all modifiers have a CCC of Above or less.
555 if ccc == 0 || ccc > above {
556 log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
559 // A.2.3: a sequence of modifiers all have the same CCC.
561 log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
564 // A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above.
565 if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
566 log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
569 if i += len(string(mr)); i >= len(d) {
576 // A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper]
577 if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
578 log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
581 // A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper]
582 if c.CCC == iotaSubscript && r != 0x0345 {
583 log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
586 // A.5: soft-dotted runes do not have exceptions.
587 if c.SoftDotted && c.entry&exceptionBit != 0 {
588 log.Fatalf("%U: soft-dotted has exception", r)
591 // A.6: Greek decomposition. [elUpper]
592 if unicode.Is(unicode.Greek, r) {
593 if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
594 runes := []rune(string(b))
595 // A.6.1: If a Greek rune decomposes and the first rune of the
596 // decomposition is greater than U+00FF, the rune is always
597 // great and not a modifier.
598 if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
599 log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f)
601 // A.6.2: Any follow-up rune in a Greek decomposition is a
602 // modifier of which the first should be gobbled in
604 for _, m := range runes[1:] {
606 case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
608 log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m)
614 // Breaking properties.
616 // B.1: all runes with CCC > 0 are of break type Extend.
617 if c.CCC > 0 && c.BreakType != "Extend" {
618 log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
621 // B.2: all cased runes with c.CCC == 0 are of break type ALetter.
622 if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
623 log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
626 // B.3: letter category.
627 if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
628 if c.BreakCat != breakLetter {
629 log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
635 func genTablesTest() {
638 fmt.Fprintln(w, "var (")
639 printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)
641 // We discard the output as we know we have perfect functions. We run them
642 // just to verify the properties are correct.
643 n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
644 n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
645 n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
647 log.Fatalf("One of the discarded properties does not have a perfect filter.")
650 // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
651 fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
652 parse("SpecialCasing.txt", func(p *ucd.Parser) {
653 // Skip conditional entries.
654 if p.String(4) != "" {
658 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
659 r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
661 fmt.Fprint(w, "\t}\n\n")
663 // <code>; <type>; <runes>
664 table := map[rune]struct{ simple, full, special string }{}
665 parse("CaseFolding.txt", func(p *ucd.Parser) {
668 v := string(p.Runes(2))
669 if t != "T" && v == string(unicode.ToLower(r)) {
686 fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
687 for r := rune(0); r < 0x10FFFF; r++ {
692 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
694 fmt.Fprint(w, "\t}\n\n")
697 notBreak := map[rune]bool{}
698 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
700 case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
701 "ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ":
702 notBreak[p.Rune(0)] = true
706 fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
708 for r := rune(0); r <= lastRuneForTesting; r++ {
709 if isBreak := !notBreak[r]; isBreak != inBreak {
711 fmt.Fprintf(w, "\t\t{0x%x, ", r)
713 fmt.Fprintf(w, "0x%x},\n", r-1)
719 fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
721 fmt.Fprint(w, "\t}\n\n")
724 // Filter out all samples that do not contain cased characters.
725 cased := map[rune]bool{}
726 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
727 if p.String(1) == "Cased" {
728 cased[p.Rune(0)] = true
732 fmt.Fprintln(w, "\tbreakTest = []string{")
733 parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
734 c := strings.Split(p.String(0), " ")
739 for ; len(c) >= 2; c = c[2:] {
740 if c[0] == "รท" && test != "" {
743 i, err := strconv.ParseUint(c[1], 16, 32)
746 log.Fatalf("Invalid rune %q.", c[1])
749 log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
757 fmt.Fprintf(w, "\t\t%q,\n", test)
760 fmt.Fprintln(w, "\t}")
764 gen.WriteGoFile("tables_test.go", "cases", w.Bytes())
767 // These functions are just used for verification that their definition have not
768 // changed in the Unicode Standard.
770 func verifyCased(r rune) bool {
771 return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
774 func verifyLower(r rune) bool {
775 return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r)
778 func verifyUpper(r rune) bool {
779 return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r)
782 // verifyIgnore is an approximation of the Case_Ignorable property using the
783 // core unicode package. It is used to reduce the size of the test data.
784 func verifyIgnore(r rune) bool {
785 props := []*unicode.RangeTable{
792 for _, p := range props {
793 if unicode.Is(p, r) {
800 // printProperties prints tables of rune properties from the given UCD file.
801 // A filter func f can be given to exclude certain values. A rune r will have
802 // the indicated property if it is in the generated table or if f(r).
803 func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
804 verify := map[rune]bool{}
806 varNameParts := strings.Split(property, "_")
807 varNameParts[0] = strings.ToLower(varNameParts[0])
808 fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
809 parse(file, func(p *ucd.Parser) {
810 if p.String(1) == property {
815 fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
819 fmt.Fprint(w, "\t}\n\n")
821 // Verify that f is correct, that is, it represents a subset of the property.
822 for r := rune(0); r <= lastRuneForTesting; r++ {
823 if !verify[r] && f(r) {
824 log.Fatalf("Incorrect filter func for property %q.", property)
830 // The newCaseTrie, sparseValues and sparseOffsets definitions below are
831 // placeholders referred to by gen_trieval.go. The real definitions are
832 // generated by this program and written to tables.go.
834 func newCaseTrie(int) int { return 0 }
837 sparseValues [0]valueRange
838 sparseOffsets [0]uint16