1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
14 "golang.org/x/text/internal/colltab"
15 "golang.org/x/text/unicode/norm"
18 type logicalAnchor int
21 firstAnchor logicalAnchor = -1
26 // entry is used to keep track of a single entry in the collation element table
27 // during building. Examples of entries can be found in the Default Unicode
28 // Collation Element Table.
29 // See http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt.
31 str string // same as string(runes)
33 elems []rawCE // the collation elements
34 extend string // weights of extend to be appended to elems
35 before bool // weights relative to next instead of previous.
36 lock bool // entry is used in extension and can no longer be moved.
38 // prev, next, and level are used to keep track of tailorings.
40 level colltab.Level // next differs at this level
41 skipRemove bool // do not unlink when removed
43 decompose bool // can use NFKD decomposition to generate elems
44 exclude bool // do not include in table
45 implicit bool // derived, is not included in the list
46 modified bool // entry was modified in tailoring
49 expansionIndex int // used to store index into expansion table
50 contractionHandle ctHandle
51 contractionIndex int // index into contraction elements
54 func (e *entry) String() string {
55 return fmt.Sprintf("%X (%q) -> %X (ch:%x; ci:%d, ei:%d)",
56 e.runes, e.str, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex)
59 func (e *entry) skip() bool {
60 return e.contraction()
63 func (e *entry) expansion() bool {
64 return !e.decompose && len(e.elems) > 1
67 func (e *entry) contraction() bool {
68 return len(e.runes) > 1
71 func (e *entry) contractionStarter() bool {
72 return e.contractionHandle.n != 0
75 // nextIndexed gets the next entry that needs to be stored in the table.
76 // It returns the entry and the collation level at which the next entry differs
77 // from the current entry.
78 // Entries that can be explicitly derived and logical reset positions are
79 // examples of entries that will not be indexed.
80 func (e *entry) nextIndexed() (*entry, colltab.Level) {
82 for e = e.next; e != nil && (e.exclude || len(e.elems) == 0); e = e.next {
90 // remove unlinks entry e from the sorted chain and clears the collation
91 // elements. e may not be at the front or end of the list. This should always
92 // be the case, as the front and end of the list are always logical anchors,
93 // which may not be removed.
94 func (e *entry) remove() {
95 if e.logical != noAnchor {
96 log.Fatalf("may not remove anchor %q", e.str)
98 // TODO: need to set e.prev.level to e.level if e.level is smaller?
111 // insertAfter inserts n after e.
112 func (e *entry) insertAfter(n *entry) {
117 panic("unexpected nil anchor")
120 n.decompose = false // redo decomposition test
130 // insertBefore inserts n before e.
131 func (e *entry) insertBefore(n *entry) {
136 panic("unexpected nil anchor")
139 n.decompose = false // redo decomposition test
149 func (e *entry) encodeBase() (ce uint32, err error) {
152 ce, err = makeExpandIndex(e.expansionIndex)
155 log.Fatal("decompose should be handled elsewhere")
157 ce, err = makeCE(e.elems[0])
162 func (e *entry) encode() (ce uint32, err error) {
164 log.Fatal("cannot build colElem for entry that should be skipped")
168 t1 := e.elems[0].w[2]
170 if len(e.elems) > 1 {
173 ce, err = makeDecompose(t1, t2)
174 case e.contractionStarter():
175 ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex)
177 if len(e.runes) > 1 {
178 log.Fatal("colElem: contractions are handled in contraction trie")
180 ce, err = e.encodeBase()
185 // entryLess returns true if a sorts before b and false otherwise.
186 func entryLess(a, b *entry) bool {
187 if res, _ := compareWeights(a.elems, b.elems); res != 0 {
190 if a.logical != noAnchor {
191 return a.logical == firstAnchor
193 if b.logical != noAnchor {
194 return b.logical == lastAnchor
199 type sortedEntries []*entry
201 func (s sortedEntries) Len() int {
205 func (s sortedEntries) Swap(i, j int) {
206 s[i], s[j] = s[j], s[i]
209 func (s sortedEntries) Less(i, j int) bool {
210 return entryLess(s[i], s[j])
213 type ordering struct {
215 entryMap map[string]*entry
220 // insert inserts e into both entryMap and ordered.
221 // Note that insert simply appends e to ordered. To reattain a sorted
222 // order, o.sort() should be called.
223 func (o *ordering) insert(e *entry) {
224 if e.logical == noAnchor {
225 o.entryMap[e.str] = e
227 // Use key format as used in UCA rules.
228 o.entryMap[fmt.Sprintf("[%s]", e.str)] = e
229 // Also add index entry for XML format.
230 o.entryMap[fmt.Sprintf("<%s/>", strings.Replace(e.str, " ", "_", -1))] = e
232 o.ordered = append(o.ordered, e)
235 // newEntry creates a new entry for the given info and inserts it into
237 func (o *ordering) newEntry(s string, ces []rawCE) *entry {
247 // find looks up and returns the entry for the given string.
248 // It returns nil if str is not in the index and if an implicit value
249 // cannot be derived, that is, if str represents more than one rune.
250 func (o *ordering) find(str string) *entry {
259 if r[0] >= firstHangul && r[0] <= lastHangul {
261 nfd := norm.NFD.String(str)
262 for _, r := range nfd {
263 ce = append(ce, o.find(string(r)).elems...)
265 e = o.newEntry(nfd, ce)
267 e = o.newEntry(string(r[0]), []rawCE{
269 implicitPrimary(r[0]),
278 e.exclude = true // do not index implicits
284 // makeRootOrdering returns a newly initialized ordering value and populates
285 // it with a set of logical reset points that can be used as anchors.
286 // The anchors first_tertiary_ignorable and __END__ will always sort at
287 // the beginning and end, respectively. This means that prev and next are non-nil
288 // for any indexed entry.
289 func makeRootOrdering() ordering {
290 const max = unicode.MaxRune
292 entryMap: make(map[string]*entry),
294 insert := func(typ logicalAnchor, s string, ce []int) {
296 elems: []rawCE{{w: ce}},
303 insert(firstAnchor, "first tertiary ignorable", []int{0, 0, 0, 0})
304 insert(lastAnchor, "last tertiary ignorable", []int{0, 0, 0, max})
305 insert(lastAnchor, "last primary ignorable", []int{0, defaultSecondary, defaultTertiary, max})
306 insert(lastAnchor, "last non ignorable", []int{maxPrimary, defaultSecondary, defaultTertiary, max})
307 insert(lastAnchor, "__END__", []int{1 << maxPrimaryBits, defaultSecondary, defaultTertiary, max})
311 // patchForInsert eleminates entries from the list with more than one collation element.
312 // The next and prev fields of the eliminated entries still point to appropriate
313 // values in the newly created list.
314 // It requires that sort has been called.
315 func (o *ordering) patchForInsert() {
316 for i := 0; i < len(o.ordered)-1; {
320 for ; n != nil && len(n.elems) > 1; n = n.next {
326 for ; o.ordered[i] != n; i++ {
327 o.ordered[i].level = lev
328 o.ordered[i].next = n
329 o.ordered[i+1].prev = e
334 // clone copies all ordering of es into a new ordering value.
335 func (o *ordering) clone() *ordering {
338 entryMap: make(map[string]*entry),
340 for _, e := range o.ordered {
345 decompose: e.decompose,
351 oo.sort() // link all ordering.
356 // front returns the first entry to be indexed.
357 // It assumes that sort() has been called.
358 func (o *ordering) front() *entry {
361 log.Panicf("unexpected first entry: %v", e)
363 // The first entry is always a logical position, which should not be indexed.
364 e, _ = e.nextIndexed()
368 // sort sorts all ordering based on their collation elements and initializes
369 // the prev, next, and level fields accordingly.
370 func (o *ordering) sort() {
371 sort.Sort(sortedEntries(o.ordered))
373 for i := 1; i < len(l); i++ {
376 _, l[k].level = compareWeights(l[k].elems, l[i].elems)
381 // genColElems generates a collation element array from the runes in str. This
382 // assumes that all collation elements have already been added to the Builder.
383 func (o *ordering) genColElems(str string) []rawCE {
385 for _, r := range []rune(str) {
386 for _, ce := range o.find(string(r)).elems {
387 if ce.w[0] != 0 || ce.w[1] != 0 || ce.w[2] != 0 {
388 elems = append(elems, ce)