1 // Copyright 2015 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
12 "golang.org/x/text/language"
// TestCompile is a table-driven test of pattern compilation. For each case it
// builds a matcher for language.Und with the case's options, compiles the
// case's pattern string, and checks that the compiled pattern's ce slice has
// exactly tc.n entries.
15 func TestCompile(t *testing.T) {
16 for i, tc := range []struct {
30 desc: "keep modifier",
31 pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
34 desc: "remove modifier",
35 pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
36 options: []Option{IgnoreDiacritics},
39 desc: "single with double collation element",
43 desc: "leading variable",
47 desc: "trailing variable",
51 desc: "leading and trailing variable",
55 desc: "keep interior variable",
59 desc: "keep interior variables",
63 desc: "remove ignoreables (zero-weights across the board)",
64 pattern: "\u009Db\u009Dä\u009D", // U+009D: OPERATING SYSTEM COMMAND
67 m := New(language.Und, tc.options...)
68 p := m.CompileString(tc.pattern)
// Only the length of p.ce is compared, not the individual entries.
69 if len(p.ce) != tc.n {
70 t.Errorf("%d:%s: Compile(%+q): got %d; want %d", i, tc.desc, tc.pattern, len(p.ce), tc.n)
// TestNorm checks how pattern compilation treats differently ordered
// sequences of combining marks. Each case compiles two strings, a and b, with
// a matcher for language.Und and compares the results with reflect.DeepEqual;
// the outcome must equal the case's want field.
75 func TestNorm(t *testing.T) {
76 // U+0300: COMBINING GRAVE ACCENT (CCC=230)
77 // U+031B: COMBINING HORN (CCC=216)
// NOTE(review): the visible cases below use U+0318 rather than U+031B;
// presumably U+0318 is COMBINING LEFT TACK BELOW (CCC=220) — confirm against
// the Unicode Character Database.
78 for _, tc := range []struct {
82 want bool // a and b compile into the same pattern?
89 "large number of modifiers in pattern",
90 strings.Repeat("\u0300", 29) + "\u0318",
91 "\u0318" + strings.Repeat("\u0300", 29),
94 "modifier overflow in pattern",
95 strings.Repeat("\u0300", 30) + "\u0318",
96 "\u0318" + strings.Repeat("\u0300", 30),
99 m := New(language.Und)
100 a := m.CompileString(tc.a)
101 b := m.CompileString(tc.b)
102 if got := reflect.DeepEqual(a, b); got != tc.want {
103 t.Errorf("Compile(a) == Compile(b) == %v; want %v", got, tc.want)
// TestForwardSearch is a table-driven test of forward searching with a
// compiled pattern. For each case it builds a matcher from the case's language
// tag and options, compiles the pattern, and repeatedly calls IndexString on
// the remaining part of the text. Every reported [start end] pair is compared
// with the next two values of tc.want, which are consumed as the search
// proceeds; values still left in tc.want after the loop are reported as
// errors.
108 func TestForwardSearch(t *testing.T) {
109 for i, tc := range []struct {
117 // The semantics of an empty search is to match nothing.
118 // TODO: change this to be in line with strings.Index? It is quite a
119 // different beast, so not sure yet.
121 desc: "empty pattern and text",
125 want: nil, // TODO: consider: []int{0, 0},
127 desc: "non-empty pattern and empty text",
133 desc: "empty pattern and non-empty text",
137 want: nil, // TODO: consider: []int{0, 0, 1, 1, 2, 2, 3, 3},
139 // Variable-only patterns. We don't support variables at the moment,
140 // but verify that, given this, the behavior is indeed as expected.
142 desc: "exact match of variable",
148 desc: "variables not handled by default",
152 want: nil, // Would be (1, 2) for a median match with variable.
154 desc: "multiple subsequent identical variables",
158 want: []int{0, 1, 1, 2, 2, 3, 3, 4},
160 desc: "text with variables",
162 options: []Option{IgnoreDiacritics},
167 desc: "pattern with interior variables",
169 options: []Option{IgnoreDiacritics},
171 text: "3 a b c abc a b c 3",
172 want: []int{2, 7}, // Would have 3 matches using variable.
174 // TODO: Different variable handling settings.
178 desc: "match all levels",
181 text: "abcAbcABCÁbcábc",
184 desc: "ignore diacritics in text",
186 options: []Option{IgnoreDiacritics},
191 desc: "ignore diacritics in pattern",
193 options: []Option{IgnoreDiacritics},
198 desc: "ignore diacritics",
200 options: []Option{IgnoreDiacritics},
202 text: "abcAbcABCÁbcábc",
203 want: []int{3, 6, 9, 13},
207 options: []Option{IgnoreCase},
209 text: "abcAbcABCÁbcábc",
210 want: []int{0, 3, 3, 6, 6, 9},
212 desc: "ignore case and diacritics",
214 options: []Option{IgnoreCase, IgnoreDiacritics},
216 text: "abcAbcABCÁbcábc",
217 want: []int{0, 3, 3, 6, 6, 9, 9, 13, 13, 17},
219 desc: "ignore width to fullwidth",
221 options: []Option{IgnoreWidth},
223 text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
226 // TODO: distinguish between case and width.
227 desc: "don't ignore width to fullwidth, ignoring only case",
229 options: []Option{IgnoreCase},
231 text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
234 desc: "ignore width to fullwidth and diacritics",
236 options: []Option{IgnoreWidth, IgnoreDiacritics},
238 text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
241 desc: "whole grapheme, single rune",
247 // Note: rules on when to apply contractions may, for certain languages,
248 // differ between search and collation. For example, "ch" is not
249 // considered a contraction for the purpose of searching in Spanish.
250 // Therefore, be careful picking this test.
251 desc: "whole grapheme, contractions",
254 // Fails at the primary level, because "aa" is a contraction.
255 text: "123 abaa 123",
258 desc: "whole grapheme, trailing modifier",
261 text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
264 // Language-specific matching.
268 options: []Option{IgnoreCase},
270 text: "AarhusÅrhus Århus ",
271 want: []int{0, 6, 6, 12, 14, 20},
275 options: []Option{IgnoreCase},
277 text: "Århus Aarhus",
278 want: []int{0, 6, 7, 13},
281 tag: "en", // Å does not match A for English.
282 options: []Option{IgnoreCase},
287 desc: "ignore modifier in text",
288 options: []Option{IgnoreDiacritics},
291 text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
292 want: []int{4, 9}, // Matches on grapheme boundary.
294 desc: "ignore multiple modifiers in text",
295 options: []Option{IgnoreDiacritics},
298 text: "123 eee\u0300\u0300 123", // U+0300: COMBINING GRAVE ACCENT
299 want: []int{4, 11}, // Matches on grapheme boundary.
301 desc: "ignore modifier in pattern",
302 options: []Option{IgnoreDiacritics},
304 pattern: "eee\u0300", // U+0300: COMBINING GRAVE ACCENT
308 desc: "ignore multiple modifiers in pattern",
309 options: []Option{IgnoreDiacritics},
311 pattern: "eee\u0300\u0300", // U+0300: COMBINING GRAVE ACCENT
315 desc: "match non-normalized pattern",
317 // U+0300: COMBINING GRAVE ACCENT (CCC=230)
318 // U+031B: COMBINING HORN (CCC=216)
319 pattern: "eee\u0300\u031b",
320 text: "123 eee\u031b\u0300 123",
323 desc: "match non-normalized text",
325 // U+0300: COMBINING GRAVE ACCENT (CCC=230)
326 // U+031B: COMBINING HORN (CCC=216)
327 pattern: "eee\u031b\u0300",
328 text: "123 eee\u0300\u031b 123",
331 m := New(language.MustParse(tc.tag), tc.options...)
332 p := m.CompileString(tc.pattern)
// Search the text starting at byte offset j, one result per iteration.
333 for j := 0; j < len(tc.text); {
334 start, end := p.IndexString(tc.text[j:])
335 if start == -1 && end == -1 {
342 if len(tc.want) == 0 {
343 t.Errorf("%d:%s: found unexpected result [%d %d]", i, tc.desc, start, end)
346 if tc.want[0] != start || tc.want[1] != end {
347 t.Errorf("%d:%s: got [%d %d]; want %v", i, tc.desc, start, end, tc.want[:2])
348 tc.want = tc.want[2:]
351 tc.want = tc.want[2:]
// Any values still in tc.want were expected but never reported.
353 if len(tc.want) != 0 {
354 t.Errorf("%d:%s: %d extra results", i, tc.desc, len(tc.want)/2)