1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
11 "golang.org/x/text/internal/colltab"
15 defaultSecondary = 0x20
25 func makeRawCE(w []int, ccc uint8) rawCE {
26 ce := rawCE{w: make([]int, 4), ccc: ccc}
31 // A collation element is represented as an uint32.
32 // In the typical case, a rune maps to a single collation element. If a rune
33 // can be the start of a contraction or expands into multiple collation elements,
34 // then the collation element that is associated with a rune will have a special
35 // form to represent such m to n mappings. Such special collation elements
36 // have a value >= 0x80000000.
44 func makeCE(ce rawCE) (uint32, error) {
45 v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
49 // For contractions, collation elements are of the form
50 // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
51 // - n* is the size of the first node in the contraction trie.
52 // - i* is the index of the first node in the contraction trie.
53 // - b* is the offset into the contraction collation element table.
54 // See contract.go for details on the contraction trie.
56 contractID = 0xC0000000
59 maxContractOffsetBits = 13
62 func makeContractIndex(h ctHandle, offset int) (uint32, error) {
63 if h.n >= 1<<maxNBits {
64 return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
66 if h.index >= 1<<maxTrieIndexBits {
67 return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
69 if offset >= 1<<maxContractOffsetBits {
70 return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
72 ce := uint32(contractID)
73 ce += uint32(offset << (maxNBits + maxTrieIndexBits))
74 ce += uint32(h.index << maxNBits)
79 // For expansions, collation elements are of the form
80 // 11100000 00000000 bbbbbbbb bbbbbbbb,
81 // where b* is the index into the expansion sequence table.
84 maxExpandIndexBits = 16
87 func makeExpandIndex(index int) (uint32, error) {
88 if index >= 1<<maxExpandIndexBits {
89 return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
91 return expandID + uint32(index), nil
94 // Each list of collation elements corresponding to an expansion starts with
95 // a header indicating the length of the sequence.
96 func makeExpansionHeader(n int) (uint32, error) {
100 // Some runes can be expanded using NFKD decomposition. Instead of storing the full
101 // sequence of collation elements, we decompose the rune and lookup the collation
102 // elements for each rune in the decomposition and modify the tertiary weights.
103 // The collation element, in this case, is of the form
104 // 11110000 00000000 wwwwwwww vvvvvvvv, where
105 // - v* is the replacement tertiary weight for the first rune,
106 // - w* is the replacement tertiary weight for the second rune,
107 // Tertiary weights of subsequent runes should be replaced with maxTertiary.
108 // See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
110 decompID = 0xF0000000
113 func makeDecompose(t1, t2 int) (uint32, error) {
114 if t1 >= 256 || t1 < 0 {
115 return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
117 if t2 >= 256 || t2 < 0 {
118 return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
120 return uint32(t2<<8+t1) + decompID, nil
124 // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
125 minUnified rune = 0x4E00
127 minCompatibility = 0xF900
128 maxCompatibility = 0xFAFF
133 commonUnifiedOffset = 0x10000
134 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
135 otherOffset = 0x50000 // largest rune in rare is U+2FA1D
136 illegalOffset = otherOffset + int(unicode.MaxRune)
137 maxPrimary = illegalOffset + 1
140 // implicitPrimary returns the primary weight for the a rune
141 // for which there is no entry for the rune in the collation table.
142 // We take a different approach from the one specified in
143 // http://unicode.org/reports/tr10/#Implicit_Weights,
144 // but preserve the resulting relative ordering of the runes.
145 func implicitPrimary(r rune) int {
146 if unicode.Is(unicode.Ideographic, r) {
147 if r >= minUnified && r <= maxUnified {
148 // The most common case for CJK.
149 return int(r) + commonUnifiedOffset
151 if r >= minCompatibility && r <= maxCompatibility {
152 // This will typically not hit. The DUCET explicitly specifies mappings
153 // for all characters that do not decompose.
154 return int(r) + commonUnifiedOffset
156 return int(r) + rareUnifiedOffset
158 return int(r) + otherOffset
161 // convertLargeWeights converts collation elements with large
162 // primaries (either double primaries or for illegal runes)
163 // to our own representation.
164 // A CJK character C is represented in the DUCET as
165 // [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
166 // We will rewrite these characters to a single CE.
167 // We assume the CJK values start at 0x8000.
168 // See http://unicode.org/reports/tr10/#Implicit_Weights
169 func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
171 cjkPrimaryStart = 0xFB40
172 rarePrimaryStart = 0xFB80
173 otherPrimaryStart = 0xFBC0
174 illegalPrimary = 0xFFFE
180 for i := 0; i < len(elems); i++ {
183 if p < cjkPrimaryStart {
187 return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
189 if p >= illegalPrimary {
190 ce[0] = illegalOffset + p - illegalPrimary
192 if i+1 >= len(elems) {
193 return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
195 if elems[i+1].w[0]&lowBitsFlag == 0 {
196 return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
198 np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
200 case p < rarePrimaryStart:
201 np += commonUnifiedOffset
202 case p < otherPrimaryStart:
203 np += rareUnifiedOffset
208 for j := i + 1; j+1 < len(elems); j++ {
209 elems[j] = elems[j+1]
211 elems = elems[:len(elems)-1]
217 // nextWeight computes the first possible collation weights following elems
218 // for the given level.
219 func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
220 if level == colltab.Identity {
221 next := make([]rawCE, len(elems))
225 next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
227 if level < colltab.Secondary {
228 next[0].w[colltab.Secondary] = defaultSecondary
230 if level < colltab.Tertiary {
231 next[0].w[colltab.Tertiary] = defaultTertiary
233 // Filter entries that cannot influence ordering.
234 for _, ce := range elems[1:] {
236 for i := colltab.Primary; i < level; i++ {
237 skip = skip && ce.w[i] == 0
240 next = append(next, ce)
246 func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
247 for ; i < len(elems) && elems[i].w[level] == 0; i++ {
250 return i, elems[i].w[level]
255 // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
256 // It also returns the collation level at which the difference is found.
257 func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
258 for level := colltab.Primary; level < colltab.Identity; level++ {
260 for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
261 ia, va = nextVal(a, ia, level)
262 ib, vb = nextVal(b, ib, level)
272 return 0, colltab.Identity
275 func equalCE(a, b rawCE) bool {
276 for i := 0; i < 3; i++ {
277 if b.w[i] != a.w[i] {
284 func equalCEArrays(a, b []rawCE) bool {
285 if len(a) != len(b) {
289 if !equalCE(a[i], b[i]) {