1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
10 "golang.org/x/text/collate/build"
11 "golang.org/x/text/internal/colltab"
12 "golang.org/x/text/unicode/norm"
// ColElems is a slice of Weights values.
15 type ColElems []Weights
// tableTest is one table-driven case. TestAppendNext passes its in field to
// makeTable and iterates over its chk field, reading in, n and out from each
// check.
28 type tableTest struct {
// w builds a Weights value from a variadic list of ints. The body is not
// visible in this excerpt; callers in this file pass one to three values,
// e.g. w(100), w(0, 25) and w(0, 40, 0x1F).
33 func w(ce ...int) Weights {
// pt returns the three-element slice {p, defaults.Secondary, t}.
// NOTE(review): presumably p is a primary and t a tertiary weight, with the
// default secondary weight in between — confirm against the definition of
// defaults.
39 func pt(p, t int) []int {
40 return []int{p, defaults.Secondary, t}
// makeTable creates a build.Builder, adds every element of in to it with
// b.Add, and returns a Collator created with NewFromTable together with a nil
// error. The handling of a failed b.Add and the construction of t are not
// visible in this excerpt.
43 func makeTable(in []input) (*Collator, error) {
44 b := build.NewBuilder()
45 for _, r := range in {
// Each entry's string is added as a rune slice together with its ces field.
46 if e := b.Add([]rune(r.str), r.ces, nil); e != nil {
54 return NewFromTable(t), nil
57 // modSeq holds a sequence of modifiers in increasing order of CCC long enough
58 // to cause a segment overflow if not handled correctly. The last rune in this
59 // list has a CCC of 214.
61 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BB,
62 0x05BC, 0x05BD, 0x05BF, 0x05C1, 0x05C2, 0xFB1E, 0x064B, 0x064C, 0x064D, 0x064E,
63 0x064F, 0x0650, 0x0651, 0x0652, 0x0670, 0x0711, 0x0C55, 0x0C56, 0x0E38, 0x0E48,
64 0x0EB8, 0x0EC8, 0x0F71, 0x0F72, 0x0F74, 0x0321, 0x1DCE,
// modW is initialized from a function literal returning ColElems. For every
// rune in modSeq the literal looks up the rune's NFC properties, appends
// w(0, ccc) to ws and appends a matching input entry to mods. The
// declarations of ws and mods and the return statement are not visible in
// this excerpt; since modW is sliced elsewhere in this file (modW[:30]), the
// literal is presumably invoked immediately — confirm.
68 var modW = func() ColElems {
70 for _, r := range modSeq {
// NOTE(review): the local name rune shadows the predeclared type rune in this scope.
71 rune := norm.NFC.PropertiesString(string(r))
// The second weight passed to w is the value returned by CCC() for the rune.
72 ws = append(ws, w(0, int(rune.CCC())))
// The same CCC value is used as the second weight of the table input for this rune.
73 mods = append(mods, input{string(r), [][]int{{0, int(rune.CCC())}}})
// appendNextTests holds the table-driven cases run by TestAppendNext. Each
// case supplies the inputs from which makeTable builds a Collator and a list
// of checks; every check gives an input string, the number of bytes
// AppendNext is expected to consume, and the expected weights.
78 var appendNextTests = []tableTest{
81 {"a", [][]int{{100}}},
82 {"b", [][]int{{105}}},
83 {"c", [][]int{{110}}},
84 {"ß", [][]int{{120}}},
87 {"a", 1, ColElems{w(100)}},
88 {"b", 1, ColElems{w(105)}},
89 {"c", 1, ColElems{w(110)}},
90 {"d", 1, ColElems{w(0x50064)}}, // no entry for "d" is visible among the inputs above
91 {"ab", 1, ColElems{w(100)}},
92 {"bc", 1, ColElems{w(105)}},
93 {"dd", 1, ColElems{w(0x50064)}},
94 {"ß", 2, ColElems{w(120)}}, // ß occupies two bytes in UTF-8
99 {"u", [][]int{{100}}},
100 {"U", [][]int{{100}, {0, 25}}},
101 {"w", [][]int{{100}, {100}}},
102 {"W", [][]int{{100}, {0, 25}, {100}, {0, 25}}},
105 {"u", 1, ColElems{w(100)}},
106 {"U", 1, ColElems{w(100), w(0, 25)}},
107 {"w", 1, ColElems{w(100), w(100)}},
108 {"W", 1, ColElems{w(100), w(0, 25), w(100), w(0, 25)}},
113 {"D", [][]int{pt(104, 8)}},
114 {"z", [][]int{pt(130, 8)}},
115 {"\u030C", [][]int{{0, 40}}}, // Caron
116 {"\u01C5", [][]int{pt(104, 9), pt(130, 4), {0, 40, 0x1F}}}, // Dž = D+z+caron
119 {"\u01C5", 2, ColElems{w(pt(104, 9)...), w(pt(130, 4)...), w(0, 40, 0x1F)}},
122 { // test basic contraction
124 {"a", [][]int{{100}}},
125 {"ab", [][]int{{101}}},
126 {"aab", [][]int{{101}, {101}}},
127 {"abc", [][]int{{102}}},
128 {"b", [][]int{{200}}},
129 {"c", [][]int{{300}}},
130 {"d", [][]int{{400}}},
133 {"a", 1, ColElems{w(100)}},
134 {"aa", 1, ColElems{w(100)}},
135 {"aac", 1, ColElems{w(100)}},
136 {"d", 1, ColElems{w(400)}},
137 {"ab", 2, ColElems{w(101)}},
138 {"abb", 2, ColElems{w(101)}},
139 {"aab", 3, ColElems{w(101), w(101)}},
140 {"aaba", 3, ColElems{w(101), w(101)}},
141 {"abc", 3, ColElems{w(102)}},
142 {"abcd", 3, ColElems{w(102)}},
145 { // test discontinuous contraction
146 append(mods, []input{
147 // modifiers; secondary weight equals ccc
148 {"\u0316", [][]int{{0, 220}}},
149 {"\u0317", [][]int{{0, 220}, {0, 220}}},
150 {"\u302D", [][]int{{0, 222}}},
151 {"\u302E", [][]int{{0, 225}}}, // used as starter
152 {"\u302F", [][]int{{0, 224}}}, // used as starter
153 {"\u18A9", [][]int{{0, 228}}},
154 {"\u0300", [][]int{{0, 230}}},
155 {"\u0301", [][]int{{0, 230}}},
156 {"\u0315", [][]int{{0, 232}}},
157 {"\u031A", [][]int{{0, 232}}},
158 {"\u035C", [][]int{{0, 233}}},
159 {"\u035F", [][]int{{0, 233}}},
160 {"\u035D", [][]int{{0, 234}}},
161 {"\u035E", [][]int{{0, 234}}},
162 {"\u0345", [][]int{{0, 240}}},
165 {"a", [][]int{{100}}},
166 {"b", [][]int{{200}}},
167 {"c", [][]int{{300}}},
168 {"\u03B1", [][]int{{900}}},
169 {"\x01", [][]int{{0, 0, 0, 0}}},
172 {"a\u0300", [][]int{{101}}},
173 {"a\u0301", [][]int{{102}}},
174 {"a\u035E", [][]int{{110}}},
175 {"a\u035Eb\u035E", [][]int{{115}}},
176 {"ac\u035Eaca\u035E", [][]int{{116}}},
177 {"a\u035Db\u035D", [][]int{{117}}},
178 {"a\u0301\u035Db", [][]int{{120}}},
179 {"a\u0301\u035F", [][]int{{121}}},
180 {"a\u0301\u035Fb", [][]int{{119}}},
181 {"\u03B1\u0345", [][]int{{901}, {902}}},
182 {"\u302E\u302F", [][]int{{0, 131}, {0, 131}}},
183 {"\u302F\u18A9", [][]int{{0, 130}}},
186 {"a\x01\u0300", 1, ColElems{w(100)}},
187 {"ab", 1, ColElems{w(100)}}, // closing segment
188 {"a\u0316\u0300b", 5, ColElems{w(101), w(0, 220)}}, // closing segment
189 {"a\u0316\u0300", 5, ColElems{w(101), w(0, 220)}}, // no closing segment
190 {"a\u0316\u0300\u035Cb", 5, ColElems{w(101), w(0, 220)}}, // completes before segment end
191 {"a\u0316\u0300\u035C", 5, ColElems{w(101), w(0, 220)}}, // completes before segment end
193 {"a\u0316\u0301b", 5, ColElems{w(102), w(0, 220)}}, // closing segment
194 {"a\u0316\u0301", 5, ColElems{w(102), w(0, 220)}}, // no closing segment
195 {"a\u0316\u0301\u035Cb", 5, ColElems{w(102), w(0, 220)}}, // completes before segment end
196 {"a\u0316\u0301\u035C", 5, ColElems{w(102), w(0, 220)}}, // completes before segment end
198 // match blocked by modifier with same ccc
199 {"a\u0301\u0315\u031A\u035Fb", 3, ColElems{w(102)}},
202 {"a\u0301\u035Db", 6, ColElems{w(120)}},
203 {"a\u0301\u035F", 5, ColElems{w(121)}},
204 {"a\u0301\u035Fb", 6, ColElems{w(119)}},
205 {"a\u0316\u0301\u035F", 7, ColElems{w(121), w(0, 220)}},
206 {"a\u0301\u0315\u035Fb", 7, ColElems{w(121), w(0, 232)}},
207 {"a\u0316\u0301\u0315\u035Db", 5, ColElems{w(102), w(0, 220)}},
208 {"a\u0316\u0301\u0315\u035F", 9, ColElems{w(121), w(0, 220), w(0, 232)}},
209 {"a\u0316\u0301\u0315\u035Fb", 9, ColElems{w(121), w(0, 220), w(0, 232)}},
210 {"a\u0316\u0301\u0315\u035F\u035D", 9, ColElems{w(121), w(0, 220), w(0, 232)}},
211 {"a\u0316\u0301\u0315\u035F\u035Db", 9, ColElems{w(121), w(0, 220), w(0, 232)}},
213 // handling of segment overflow
214 { // just fits within segment
215 "a" + string(modSeq[:30]) + "\u0301",
216 3 + len(string(modSeq[:30])),
217 append(ColElems{w(102)}, modW[:30]...),
219 {"a" + string(modSeq[:31]) + "\u0301", 1, ColElems{w(100)}}, // overflow
220 {"a" + string(modSeq) + "\u0301", 1, ColElems{w(100)}},
221 { // just fits within segment with two interstitial runes
222 "a" + string(modSeq[:28]) + "\u0301\u0315\u035F",
223 7 + len(string(modSeq[:28])),
224 append(append(ColElems{w(121)}, modW[:28]...), w(0, 232)),
226 { // second half does not fit within segment
227 "a" + string(modSeq[:29]) + "\u0301\u0315\u035F",
228 3 + len(string(modSeq[:29])),
229 append(ColElems{w(102)}, modW[:29]...),
232 // discontinuity can only occur in last normalization segment
233 {"a\u035Eb\u035E", 6, ColElems{w(115)}},
234 {"a\u0316\u035Eb\u035E", 5, ColElems{w(110), w(0, 220)}},
235 {"a\u035Db\u035D", 6, ColElems{w(117)}},
236 {"a\u0316\u035Db\u035D", 1, ColElems{w(100)}},
237 {"a\u035Eb\u0316\u035E", 8, ColElems{w(115), w(0, 220)}},
238 {"a\u035Db\u0316\u035D", 8, ColElems{w(117), w(0, 220)}},
239 {"ac\u035Eaca\u035E", 9, ColElems{w(116)}},
240 {"a\u0316c\u035Eaca\u035E", 1, ColElems{w(100)}},
241 {"ac\u035Eac\u0316a\u035E", 1, ColElems{w(100)}},
243 // expanding contraction
244 {"\u03B1\u0345", 4, ColElems{w(901), w(902)}},
246 // Theoretical possibilities
247 // contraction within a gap
248 {"a\u302F\u18A9\u0301", 9, ColElems{w(102), w(0, 130)}},
249 // expansion within a gap
250 {"a\u0317\u0301", 5, ColElems{w(102), w(0, 220), w(0, 220)}},
251 // repeating CCC blocks last modifier
252 {"a\u302E\u302F\u0301", 1, ColElems{w(100)}},
253 // The trailing combining characters (with lower CCC) should block the first one.
254 // TODO: make the following pass.
255 // {"a\u035E\u0316\u0316", 1, ColElems{w(100)}},
256 {"a\u035F\u035Eb", 5, ColElems{w(110), w(0, 233)}},
257 // Last combiner should match after normalization.
258 // TODO: make the following pass.
259 // {"a\u035D\u0301", 3, ColElems{w(102), w(0, 234)}},
260 // The first combiner is blocking the second one as they have the same CCC.
261 {"a\u035D\u035Eb", 1, ColElems{w(100)}},
266 func TestAppendNext(t *testing.T) {
267 for i, tt := range appendNextTests {
268 c, err := makeTable(tt.in)
270 t.Errorf("%d: error creating table: %v", i, err)
273 for j, chk := range tt.chk {
274 ws, n := c.t.AppendNext(nil, []byte(chk.in))
276 t.Errorf("%d:%d: bytes consumed was %d; want %d", i, j, n, chk.n)
278 out := convertFromWeights(chk.out)
279 if len(ws) != len(out) {
280 t.Errorf("%d:%d: len(ws) was %d; want %d (%X vs %X)\n%X", i, j, len(ws), len(out), ws, out, chk.in)
283 for k, w := range ws {
284 w, _ = colltab.MakeElem(w.Primary(), w.Secondary(), int(w.Tertiary()), 0)
286 t.Errorf("%d:%d: Weights %d was %X; want %X", i, j, k, w, out[k])