1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package build // import "golang.org/x/text/collate/build"
15 "golang.org/x/text/internal/colltab"
16 "golang.org/x/text/language"
17 "golang.org/x/text/unicode/norm"
20 // TODO: optimizations:
21 // - expandElem is currently 20K. By putting unique colElems in a separate
22 // table and having a byte array of indexes into this table, we can reduce
23 // the total size to about 7K. By also factoring out the length bytes, we
24 // can reduce this to about 6K.
25 // - trie valueBlocks are currently 100K. There are a lot of sparse blocks
26 // and many consecutive values with the same stride. This can be further
28 // - Compress secondary weights into 8 bits.
29 // - Some LDML specs specify a context element. Currently we simply concatenate
30 // those. Context can be implemented using the contraction trie. If Builder
31 // could analyze and detect when using a context makes sense, there is no
32 // need to expose this construct in the API.
34 // A Builder builds a root collation table. The user must specify the
35 // collation elements for each entry. A common use will be to base the weights
36 // on those specified in the allkeys* file as provided by the UCA or CLDR.
45 minNonVar int // lowest primary recorded for a variable
46 varTop int // highest primary recorded for a non-variable
48 // indexes used for reusing expansions and contractions
49 expIndex map[string]int // positions of expansions keyed by their string representation
50 ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes
51 ctElem map[string]int // contraction elements keyed by their string representation
54 // A Tailoring builds a collation table based on another collation table.
55 // The table is defined by specifying tailorings to the underlying table.
56 // See http://unicode.org/reports/tr35/ for an overview of tailoring
57 // collation tables. The CLDR contains pre-defined tailorings for a variety
58 // of languages (See http://www.unicode.org/Public/cldr/<version>/core.zip.)
59 type Tailoring struct {
68 // NewBuilder returns a new Builder.
69 func NewBuilder() *Builder {
71 index: newTrieBuilder(),
72 root: makeRootOrdering(),
73 expIndex: make(map[string]int),
74 ctHandle: make(map[string]ctHandle),
75 ctElem: make(map[string]int),
79 // Tailoring returns a Tailoring for the given locale. One should
80 // have completed all calls to Add before calling Tailoring.
81 func (b *Builder) Tailoring(loc language.Tag) *Tailoring {
85 index: b.root.clone(),
88 b.locale = append(b.locale, t)
92 // Add adds an entry to the collation element table, mapping
93 // a slice of runes to a sequence of collation elements.
94 // A collation element is specified as list of weights: []int{primary, secondary, ...}.
95 // The entries are typically obtained from a collation element table
96 // as defined in http://www.unicode.org/reports/tr10/#Data_Table_Format.
97 // Note that the collation elements specified by colelems are only used
98 // as a guide. The actual weights generated by Builder may differ.
99 // The argument variables is a list of indices into colelems that should contain
100 // a value for each colelem that is a variable. (See the reference above.)
101 func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
103 elems := make([]rawCE, len(colelems))
104 for i, ce := range colelems {
108 elems[i] = makeRawCE(ce, 0)
110 elems[i].w[1] = defaultSecondary
113 elems[i].w[2] = defaultTertiary
116 elems[i].w[3] = ce[0]
119 for i, ce := range elems {
122 for _, j := range variables {
128 if p >= b.minNonVar && b.minNonVar > 0 {
129 return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar)
134 } else if p > 1 { // 1 is a special primary value reserved for FFFE
136 return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop)
138 if b.minNonVar == 0 || p < b.minNonVar {
143 elems, err := convertLargeWeights(elems)
148 nfd := norm.NFD.String(str)
150 cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC())
152 if len(cccs) < len(elems) {
154 return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems))
157 for ; p > 0 && elems[p].w[0] == 0; p-- {
158 elems[p].ccc = cccs[len(cccs)-1]
161 elems[p].ccc = cccs[0]
164 for i := range elems {
165 elems[i].ccc = cccs[i]
168 // doNorm in collate.go assumes that the following conditions hold.
169 if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] {
170 return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs)
172 b.root.newEntry(str, elems)
176 func (t *Tailoring) setAnchor(anchor string) error {
177 anchor = norm.NFC.String(anchor)
178 a := t.index.find(anchor)
180 a = t.index.newEntry(anchor, nil)
183 for _, r := range []rune(anchor) {
184 e := t.index.find(string(r))
192 // SetAnchor sets the point after which elements passed in subsequent calls to
193 // Insert will be inserted. It is equivalent to the reset directive in an LDML
194 // specification. See Insert for an example.
195 // SetAnchor supports the following logical reset positions:
196 // <first_tertiary_ignorable/>, <last_teriary_ignorable/>, <first_primary_ignorable/>,
197 // and <last_non_ignorable/>.
198 func (t *Tailoring) SetAnchor(anchor string) error {
199 if err := t.setAnchor(anchor); err != nil {
206 // SetAnchorBefore is similar to SetAnchor, except that subsequent calls to
207 // Insert will insert entries before the anchor.
208 func (t *Tailoring) SetAnchorBefore(anchor string) error {
209 if err := t.setAnchor(anchor); err != nil {
216 // Insert sets the ordering of str relative to the entry set by the previous
217 // call to SetAnchor or Insert. The argument extend corresponds
218 // to the extend elements as defined in LDML. A non-empty value for extend
219 // will cause the collation elements corresponding to extend to be appended
220 // to the collation elements generated for the entry added by Insert.
221 // This has the same net effect as sorting str after the string anchor+extend.
222 // See http://www.unicode.org/reports/tr10/#Tailoring_Example for details
223 // on parametric tailoring and http://unicode.org/reports/tr35/#Collation_Elements
224 // for full details on LDML.
226 // Examples: create a tailoring for Swedish, where "ä" is ordered after "z"
227 // at the primary sorting level:
228 // t := b.Tailoring("se")
230 // t.Insert(colltab.Primary, "ä", "")
231 // Order "ü" after "ue" at the secondary sorting level:
233 // t.Insert(colltab.Secondary, "ü","")
236 // t.Insert(colltab.Secondary, "ü", "e")
237 // Order "q" afer "ab" at the secondary level and "Q" after "q"
238 // at the tertiary level:
240 // t.Insert(colltab.Secondary, "q", "")
241 // t.Insert(colltab.Tertiary, "Q", "")
242 // Order "b" before "a":
243 // t.SetAnchorBefore("a")
244 // t.Insert(colltab.Primary, "b", "")
245 // Order "0" after the last primary ignorable:
246 // t.SetAnchor("<last_primary_ignorable/>")
247 // t.Insert(colltab.Primary, "0", "")
248 func (t *Tailoring) Insert(level colltab.Level, str, extend string) error {
250 return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str)
252 str = norm.NFC.String(str)
253 e := t.index.find(str)
255 e = t.index.newEntry(str, nil)
256 } else if e.logical != noAnchor {
257 return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str)
260 return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str)
263 // Find the first element after the anchor which differs at a level smaller or
264 // equal to the given level. Then insert at this position.
265 // See http://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details.
272 for a = a.prev; a.level > level; a = a.prev {
278 for ; a.level > level; a = a.next {
285 // We don't set a to prev itself. This has the effect of the entry
286 // getting new collation elements that are an increment of itself.
287 // This is intentional.
291 e.extend = norm.NFD.String(extend)
299 func (o *ordering) getWeight(e *entry) []rawCE {
300 if len(e.elems) == 0 && e.logical == noAnchor {
302 for _, r := range e.runes {
303 e.elems = append(e.elems, o.getWeight(o.find(string(r)))...)
306 count := [colltab.Identity + 1]int{}
308 for ; a.elems == nil && !a.implicit; a = a.next {
311 e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)}
312 for i := colltab.Primary; i < colltab.Quaternary; i++ {
314 e.elems[0].w[i] -= count[i]
319 o.verifyWeights(e.prev, e, e.prev.level)
323 e.elems = nextWeight(prev.level, o.getWeight(prev))
324 o.verifyWeights(e, e.next, e.level)
330 func (o *ordering) addExtension(e *entry) {
331 if ex := o.find(e.extend); ex != nil {
332 e.elems = append(e.elems, ex.elems...)
334 for _, r := range []rune(e.extend) {
335 e.elems = append(e.elems, o.find(string(r)).elems...)
341 func (o *ordering) verifyWeights(a, b *entry, level colltab.Level) error {
342 if level == colltab.Identity || b == nil || b.elems == nil || a.elems == nil {
345 for i := colltab.Primary; i < level; i++ {
346 if a.elems[0].w[i] < b.elems[0].w[i] {
350 if a.elems[0].w[level] >= b.elems[0].w[level] {
351 err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems)
353 // TODO: return the error instead, or better, fix the conflicting entry by making room.
358 func (b *Builder) error(e error) {
364 func (b *Builder) errorID(locale string, e error) {
366 b.err = fmt.Errorf("%s:%v", locale, e)
370 // patchNorm ensures that NFC and NFD counterparts are consistent.
371 func (o *ordering) patchNorm() {
372 // Insert the NFD counterparts, if necessary.
373 for _, e := range o.ordered {
374 nfd := norm.NFD.String(e.str)
376 if e0 := o.find(nfd); e0 != nil && !e0.modified {
378 } else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) {
379 e := o.newEntry(nfd, e.elems)
384 // Update unchanged composed forms if one of their parts changed.
385 for _, e := range o.ordered {
386 nfd := norm.NFD.String(e.str)
387 if e.modified || nfd == e.str {
390 if e0 := o.find(nfd); e0 != nil {
393 e.elems = o.genColElems(nfd)
394 if norm.NFD.LastBoundary([]byte(nfd)) == 0 {
398 for i := 1; i < len(r); i++ {
399 s := norm.NFC.String(head + string(r[i]))
400 if e0 := o.find(s); e0 != nil && e0.modified {
406 e.elems = append(o.genColElems(head), o.genColElems(tail)...)
410 // Exclude entries for which the individual runes generate the same collation elements.
411 for _, e := range o.ordered {
412 if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) {
418 func (b *Builder) buildOrdering(o *ordering) {
419 for _, e := range o.ordered {
422 for _, e := range o.ordered {
428 b.processExpansions(o) // requires simplify
429 b.processContractions(o) // requires simplify
432 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
434 ce, err := e.encode()
436 t.insert(e.runes[0], ce)
439 o.handle = b.index.addTrie(t)
442 func (b *Builder) build() (*table, error) {
448 Table: colltab.Table{
449 MaxContractLen: utf8.UTFMax,
450 VariableTop: uint32(b.varTop),
454 b.buildOrdering(&b.root)
455 b.t.root = b.root.handle
456 for _, t := range b.locale {
457 b.buildOrdering(t.index)
462 i, err := b.index.generate()
464 b.t.Index = colltab.Trie{
467 Index0: i.index[blockSize*b.t.root.lookupStart:],
468 Values0: i.values[blockSize*b.t.root.valueStart:],
474 // Build builds the root Collator.
475 func (b *Builder) Build() (colltab.Weighter, error) {
476 table, err := b.build()
483 // Build builds a Collator for Tailoring t.
484 func (t *Tailoring) Build() (colltab.Weighter, error) {
489 // Print prints the tables for b and all its Tailorings as a Go file
490 // that can be included in the Collate package.
491 func (b *Builder) Print(w io.Writer) (n int, err error) {
492 p := func(nn int, e error) {
502 p(fmt.Fprintf(w, `var availableLocales = "und`))
503 for _, loc := range b.locale {
505 p(fmt.Fprintf(w, ",%s", loc.id))
508 p(fmt.Fprint(w, "\"\n\n"))
509 p(fmt.Fprintf(w, "const varTop = 0x%x\n\n", b.varTop))
510 p(fmt.Fprintln(w, "var locales = [...]tableIndex{"))
511 for _, loc := range b.locale {
513 p(t.fprintIndex(w, loc.index.handle, loc.id))
516 for _, loc := range b.locale {
518 p(t.fprintIndex(w, loc.index.handle, loc.id))
521 p(fmt.Fprint(w, "}\n\n"))
522 n, _, err = t.fprint(w, "main")
526 // reproducibleFromNFKD checks whether the given expansion could be generated
527 // from an NFKD expansion.
528 func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool {
529 // Length must be equal.
530 if len(exp) != len(nfkd) {
533 for i, ce := range exp {
534 // Primary and secondary values should be equal.
535 if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] {
538 // Tertiary values should be equal to maxTertiary for third element onwards.
539 // TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can
540 // simply be dropped. Try this out by dropping the following code.
541 if i >= 2 && ce.w[2] != maxTertiary {
544 if _, err := makeCE(ce); err != nil {
545 // Simply return false. The error will be caught elsewhere.
552 func simplify(o *ordering) {
553 // Runes that are a starter of a contraction should not be removed.
554 // (To date, there is only Kannada character 0CCA.)
555 keep := make(map[rune]bool)
556 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
557 if len(e.runes) > 1 {
558 keep[e.runes[0]] = true
561 // Tag entries for which the runes NFKD decompose to identical values.
562 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
564 nfkd := norm.NFKD.String(s)
565 nfd := norm.NFD.String(s)
566 if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd {
569 if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) {
575 // appendExpansion converts the given collation sequence to
576 // collation elements and adds them to the expansion table.
577 // It returns an index to the expansion table.
578 func (b *Builder) appendExpansion(e *entry) int {
580 i := len(t.ExpandElem)
581 ce := uint32(len(e.elems))
582 t.ExpandElem = append(t.ExpandElem, ce)
583 for _, w := range e.elems {
589 t.ExpandElem = append(t.ExpandElem, ce)
594 // processExpansions extracts data necessary to generate
595 // the extraction tables.
596 func (b *Builder) processExpansions(o *ordering) {
597 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
601 key := fmt.Sprintf("%v", e.elems)
602 i, ok := b.expIndex[key]
604 i = b.appendExpansion(e)
611 func (b *Builder) processContractions(o *ordering) {
612 // Collate contractions per starter rune.
614 cm := make(map[rune][]*entry)
615 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
617 if len(e.str) > b.t.MaxContractLen {
618 b.t.MaxContractLen = len(e.str)
621 if _, ok := cm[r]; !ok {
622 starters = append(starters, r)
624 cm[r] = append(cm[r], e)
627 // Add entries of single runes that are at a start of a contraction.
628 for e := o.front(); e != nil; e, _ = e.nextIndexed() {
629 if !e.contraction() {
631 if _, ok := cm[r]; ok {
632 cm[r] = append(cm[r], e)
636 // Build the tries for the contractions.
638 for _, r := range starters {
640 // Compute suffix strings. There are 31 different contraction suffix
641 // sets for 715 contractions and 82 contraction starter runes as of
645 for _, e := range l {
646 if len(e.runes) > 1 {
647 sufx = append(sufx, string(e.runes[1:]))
653 b.error(fmt.Errorf("no single entry for starter rune %U found", r))
656 // Unique the suffix set.
658 key := strings.Join(sufx, "\n")
659 handle, ok := b.ctHandle[key]
662 handle, err = appendTrie(&t.ContractTries, sufx)
666 b.ctHandle[key] = handle
668 // Bucket sort entries in index order.
669 es := make([]*entry, len(l))
670 for _, e := range l {
672 if len(e.runes) > 1 {
673 str := []byte(string(e.runes[1:]))
674 p, sn = lookup(&t.ContractTries, handle, str)
676 log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str))
680 log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0])
684 // Create collation elements for contractions.
686 for _, e := range es {
687 ce, err := e.encodeBase()
689 elems = append(elems, ce)
691 key = fmt.Sprintf("%v", elems)
692 i, ok := b.ctElem[key]
694 i = len(t.ContractElem)
696 t.ContractElem = append(t.ContractElem, elems...)
698 // Store info in entry for starter rune.
699 es[0].contractionIndex = i
700 es[0].contractionHandle = handle