1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
17 "golang.org/x/text/internal/testtext"
18 "golang.org/x/text/internal/ucd"
// verbose is the -verbose boolean flag (default false). Per its usage string,
// setting it prints the internal tables of matchers.
21 var verbose = flag.Bool("verbose", false, "set to true to print the internal tables of matchers")
// TestCompliance walks the "testdata" directory and feeds each opened file to
// ucd.Parse. Every parsed record becomes a subtest that builds a matcher from
// the record's supported tags and matches the record's desired tags against it.
//
// Record fields, as read below: 0 = supported tag list, 1 = desired tag list,
// 2 = expected matching tag, 3 = expected combined tag.
23 func TestCompliance(t *testing.T) {
24 filepath.Walk("testdata", func(file string, info os.FileInfo, err error) error {
// NOTE(review): no Close of r is visible in this function — confirm the file
// handle is released.
28 r, err := os.Open(file)
32 ucd.Parse(r, func(p *ucd.Parser) {
// The record name is "<supported>/<desired>" with every space removed.
33 name := strings.Replace(path.Join(p.String(0), p.String(1)), " ", "", -1)
// The subtest name is prefixed with the name reported by the file's FileInfo.
37 t.Run(info.Name()+"/"+name, func(t *testing.T) {
38 supported := makeTagList(p.String(0))
39 desired := makeTagList(p.String(1))
40 gotCombined, index, conf := NewMatcher(supported).Match(desired...)
// index selects the matched entry out of the supported list.
42 gotMatch := supported[index]
43 wantMatch := mk(p.String(2))
// A wrong match is fatal for the subtest, so the combined tag is only
// checked when the match itself is correct.
44 if gotMatch != wantMatch {
45 t.Fatalf("match: got %q; want %q (%v)", gotMatch, wantMatch, conf)
// The combined tag is compared only when field 3 parses without error.
47 wantCombined, err := Raw.Parse(p.String(3))
48 if err == nil && gotCombined != wantCombined {
49 t.Errorf("combined: got %q; want %q (%v)", gotCombined, wantCombined, conf)
// skip maps record names, in the space-free "<supported>/<desired>" form, to
// true. The comments next to each entry record the observed "got"/"want"
// difference for that case.
// NOTE(review): presumably consulted by TestCompliance to bypass these cases —
// confirm against the lookup site.
57 var skip = map[string]bool{
59 // Honor the wildcard match. This may only be useful to select non-exact
61 "mul,af/nl": true, // match: got "af"; want "mul"
63 // TODO: include other extensions.
64 // combined: got "en-GB-u-ca-buddhist-nu-arab"; want "en-GB-fonipa-t-m0-iso-i0-pinyin-u-ca-buddhist-nu-arab"
65 "und,en-GB-u-sd-gbsct/en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin": true,
67 // Inconsistencies with Mark Davis' implementation where it is not clear
70 // Inconsistencies in combined. I think the Go approach is more appropriate.
71 // We could use -u-rg- and -u-va- as an alternative.
72 "und,fr/fr-BE-fonipa": true, // combined: got "fr"; want "fr-BE-fonipa"
73 "und,fr-CA/fr-BE-fonipa": true, // combined: got "fr-CA"; want "fr-BE-fonipa"
74 "und,fr-fonupa/fr-BE-fonipa": true, // combined: got "fr-fonupa"; want "fr-BE-fonipa"
75 "und,no/nn-BE-fonipa": true, // combined: got "no"; want "no-BE-fonipa"
76 "50,und,fr-CA-fonupa/fr-BE-fonipa": true, // combined: got "fr-CA-fonupa"; want "fr-BE-fonipa"
78 // The initial number is a threshold. As we don't use scoring, we will not
80 "50,und,fr-Cyrl-CA-fonupa/fr-BE-fonipa": true,
81 // match: got "und"; want "fr-Cyrl-CA-fonupa"
82 // combined: got "und"; want "fr-Cyrl-BE-fonipa"
84 // Other interesting cases to test:
85 // - Should same language or same script have the preference if there is
86 // usually no understanding of the other script?
87 // - More specific region in desired may replace enclosing supported.
// makeTagList converts a comma-separated list of language tags into Tag values.
// Each element has its surrounding whitespace trimmed and is passed to mk; the
// results are appended to the named result tags.
90 func makeTagList(s string) (tags []Tag) {
// The loop variable s shadows the parameter of the same name.
91 for _, s := range strings.Split(s, ",") {
92 tags = append(tags, mk(strings.TrimSpace(s)))
// TestMatchStrings is a table-driven test of MatchStrings. For each case it
// builds a matcher from the comma-separated supported list, splits desired on
// "|" into the individual strings handed to MatchStrings, and compares the
// returned tag (by its String form) and index with the expected values.
97 func TestMatchStrings(t *testing.T) {
98 testCases := []struct {
100 desired string // strings separated by |
124 supported: "en-GB,nl",
125 desired: "en ; q=0.1,nl",
129 supported: "en-GB,nl",
130 desired: "en;q=0.005 | dk; q=0.1,nl ",
134 // do not match faulty tags with und
140 for _, tc := range testCases {
// The subtest name is derived from the supported and desired strings.
141 t.Run(path.Join(tc.supported, tc.desired), func(t *testing.T) {
142 m := NewMatcher(makeTagList(tc.supported))
143 tag, index := MatchStrings(m, strings.Split(tc.desired, "|")...)
144 if tag.String() != tc.tag || index != tc.index {
145 t.Errorf("got %v, %d; want %v, %d", tag, index, tc.tag, tc.index)
// TestAddLikelySubtags checks addLikelySubtags against a table of in/out pairs:
// the tag parsed from in, after addLikelySubtags, must have the same String
// form as the tag parsed from out.
151 func TestAddLikelySubtags(t *testing.T) {
// NOTE(review): a few inputs occur more than once in this table ("kk",
// "und-GB", "en-001", "und-Arab-GB") — confirm whether the repeats are wanted.
152 tests := []struct{ in, out string }{
153 {"aa", "aa-Latn-ET"},
154 {"aa-Latn", "aa-Latn-ET"},
155 {"aa-Arab", "aa-Arab-ET"},
156 {"aa-Arab-ER", "aa-Arab-ER"},
157 {"kk", "kk-Cyrl-KZ"},
158 {"kk-CN", "kk-Arab-CN"},
160 {"zh-AU", "zh-Hant-AU"},
161 {"zh-VN", "zh-Hant-VN"},
162 {"zh-SG", "zh-Hans-SG"},
163 {"zh-Hant", "zh-Hant-TW"},
164 {"zh-Hani", "zh-Hani-CN"},
165 {"und-Hani", "zh-Hani-CN"},
166 {"und", "en-Latn-US"},
167 {"und-GB", "en-Latn-GB"},
168 {"und-CW", "pap-Latn-CW"},
169 {"und-YT", "fr-Latn-YT"},
170 {"und-Arab", "ar-Arab-EG"},
171 {"und-AM", "hy-Armn-AM"},
172 {"und-TW", "zh-Hant-TW"},
173 {"und-002", "en-Latn-NG"},
174 {"und-Latn-002", "en-Latn-NG"},
175 {"en-Latn-002", "en-Latn-NG"},
176 {"en-002", "en-Latn-NG"},
177 {"en-001", "en-Latn-US"},
178 {"und-003", "en-Latn-US"},
179 {"und-GB", "en-Latn-GB"},
180 {"Latn-001", "en-Latn-US"},
181 {"en-001", "en-Latn-US"},
182 {"es-419", "es-Latn-419"},
183 {"he-145", "he-Hebr-IL"},
184 {"ky-145", "ky-Latn-TR"},
185 {"kk", "kk-Cyrl-KZ"},
186 // Don't specialize duplicate and ambiguous matches.
187 {"kk-034", "kk-Arab-034"}, // Matches IR and AF. Both are Arab.
188 {"ku-145", "ku-Latn-TR"}, // Matches IQ, TR, and LB, but kk -> TR.
189 {"und-Arab-CC", "ms-Arab-CC"},
190 {"und-Arab-GB", "ks-Arab-GB"},
191 {"und-Hans-CC", "zh-Hans-CC"},
192 {"und-CC", "en-Latn-CC"},
193 {"sr", "sr-Cyrl-RS"},
194 {"sr-151", "sr-Latn-151"}, // Matches RO and RU.
195 // We would like addLikelySubtags to generate the same results if the input
196 // only changes by adding tags that would otherwise have been added
199 // und-AA -> xx-Scrp-AA implies und-Scrp-AA -> xx-Scrp-AA
200 // und-AA -> xx-Scrp-AA implies xx-AA -> xx-Scrp-AA
201 // und-Scrp -> xx-Scrp-AA implies und-Scrp-AA -> xx-Scrp-AA
202 // und-Scrp -> xx-Scrp-AA implies xx-Scrp -> xx-Scrp-AA
203 // xx -> xx-Scrp-AA implies xx-Scrp -> xx-Scrp-AA
204 // xx -> xx-Scrp-AA implies xx-AA -> xx-Scrp-AA
206 // The algorithm specified in
207 // http://unicode.org/reports/tr35/tr35-9.html#Supplemental_Data,
208 // Section C.10, does not handle the first case. For example,
209 // the CLDR data contains an entry und-BJ -> fr-Latn-BJ, but
210 // there is no rule for und-Latn-BJ. According to spec, und-Latn-BJ
211 // would expand to en-Latn-BJ, violating the aforementioned principle.
212 // We deviate from the spec by letting und-Scrp-AA expand to xx-Scrp-AA
213 // if a rule of the form und-AA -> xx-Scrp-AA is defined.
214 // Note that as of version 23, CLDR has some explicitly specified
215 // entries that do not conform to these rules. The implementation
216 // will not correct these explicit inconsistencies. A later version of CLDR
217 // is supposed to fix this.
218 {"und-Latn-BJ", "fr-Latn-BJ"},
219 {"und-Bugi-ID", "bug-Bugi-ID"},
220 // regions, scripts and languages without definitions
221 {"und-Arab-AA", "ar-Arab-AA"},
222 {"und-Afak-RE", "fr-Afak-RE"},
223 {"und-Arab-GB", "ks-Arab-GB"},
224 {"abp-Arab-GB", "abp-Arab-GB"},
225 // script has preference over region
226 {"und-Arab-NL", "ar-Arab-NL"},
227 {"zza", "zza-Latn-TR"},
228 // preserve variants and extensions
229 {"de-1901", "de-Latn-DE-1901"},
230 {"de-x-abc", "de-Latn-DE-x-abc"},
231 {"de-1901-x-abc", "de-Latn-DE-1901-x-abc"},
232 {"x-abc", "x-abc"}, // TODO: is this the desired behavior?
// The second results of Parse and addLikelySubtags are discarded; only the
// String forms of the resulting tags are compared.
234 for i, tt := range tests {
235 in, _ := Parse(tt.in)
236 out, _ := Parse(tt.out)
237 in, _ = in.addLikelySubtags()
238 if in.String() != out.String() {
239 t.Errorf("%d: add(%s) was %s; want %s", i, tt.in, in, tt.out)
// TestMinimize checks minimize against a table of in/out pairs and then
// verifies a round trip: applying addLikelySubtags to the minimized tag must
// give the same String form as applying it to the original input.
243 func TestMinimize(t *testing.T) {
244 tests := []struct{ in, out string }{
247 {"aa-Latn-ET", "aa"},
249 {"aa-Arab", "aa-Arab"},
250 {"aa-Arab-ER", "aa-Arab-ER"},
251 {"aa-Arab-ET", "aa-Arab"},
254 {"und-Latn-US", "und"},
255 {"en-Latn-US", "en"},
257 {"cmn-Hans", "cmn-Hans"},
258 {"cmn-Hant", "cmn-Hant"},
262 {"zh-Hant", "zh-Hant"},
263 {"zh-Hant-TW", "zh-TW"},
265 {"zh-Hani", "zh-Hani"},
266 {"und-Hans", "und-Hans"},
267 {"und-Hani", "und-Hani"},
269 {"und-CW", "und-CW"},
270 {"und-YT", "und-YT"},
271 {"und-Arab", "und-Arab"},
272 {"und-AM", "und-AM"},
273 {"und-Arab-CC", "und-Arab-CC"},
274 {"und-CC", "und-CC"},
275 {"und-Latn-BJ", "und-BJ"},
276 {"und-Bugi-ID", "und-Bugi"},
277 {"bug-Bugi-ID", "bug-Bugi"},
278 // regions, scripts and languages without definitions
279 {"und-Arab-AA", "und-Arab-AA"},
280 // preserve variants and extensions
281 {"de-Latn-1901", "de-1901"},
282 {"de-Latn-x-abc", "de-x-abc"},
283 {"de-DE-1901-x-abc", "de-1901-x-abc"},
284 {"x-abc", "x-abc"}, // TODO: is this the desired behavior?
// The second results of Parse, minimize and addLikelySubtags are discarded;
// comparisons are made on String forms only.
286 for i, tt := range tests {
287 in, _ := Parse(tt.in)
288 out, _ := Parse(tt.out)
289 min, _ := in.minimize()
290 if min.String() != out.String() {
291 t.Errorf("%d: min(%s) was %s; want %s", i, tt.in, min, tt.out)
// Round trip: x is the maximized input, max is the maximized minimum.
293 max, _ := min.addLikelySubtags()
294 if x, _ := in.addLikelySubtags(); x.String() != max.String() {
295 t.Errorf("%d: max(min(%s)) = %s; want %s", i, tt.in, max, x)
// TestRegionGroups checks regionGroupDist for pairs of tags. Each case gives
// two tags and the expected distance; the two tags must resolve to the same
// script, and that script is what gets passed to regionGroupDist.
300 func TestRegionGroups(t *testing.T) {
301 testCases := []struct {
305 {"zh-TW", "zh-HK", 5},
306 {"zh-MO", "zh-HK", 4},
307 {"es-ES", "es-AR", 5},
309 {"es-419", "es-MX", 4},
310 {"es-AR", "es-MX", 4},
311 {"es-ES", "es-MX", 5},
312 {"es-PT", "es-MX", 5},
314 for _, tc := range testCases {
// The second result of Script is discarded for both tags.
316 aScript, _ := a.Script()
318 bScript, _ := b.Script()
320 if aScript != bScript {
321 t.Errorf("scripts differ: %q vs %q", aScript, bScript)
// Only the first result of regionGroupDist is checked.
324 d, _ := regionGroupDist(a.region, b.region, aScript.scriptID, a.lang)
325 if d != tc.distance {
// NOTE(review): the expected distances above are small integer literals; if d
// and tc.distance are integers, %q prints them as quoted character literals
// rather than numbers, so %d may be intended — confirm the types.
326 t.Errorf("got %q; want %q", d, tc.distance)
// TestIsParadigmLocale checks isParadigmLocale against a map from tag string to
// the expected boolean result. For each entry it calls isParadigmLocale with
// the tag's lang and region fields and reports any result that differs.
331 func TestIsParadigmLocale(t *testing.T) {
332 testCases := map[string]bool{
340 for str, want := range testCases {
342 got := isParadigmLocale(tag.lang, tag.region)
344 t.Errorf("isPL(%q) = %v; want %v", str, got, want)
349 // Implementation of String methods for various types for debugging purposes.
// String renders the matcher for debugging: a "Default:" line showing
// m.default_, followed by one indented "tag: value" line for every entry
// ranged over in m.index.
351 func (m *matcher) String() string {
353 fmt.Fprintln(w, "Default:", m.default_)
354 for tag, h := range m.index {
355 fmt.Fprintf(w, " %s: %v\n", tag, h)
// String renders the header for debugging: the label "haveTag: " followed by
// each element of h.haveTags, every element terminated by ", ".
360 func (h *matchHeader) String() string {
362 fmt.Fprint(w, "haveTag: ")
// The loop variable h shadows the receiver inside the loop.
363 for _, h := range h.haveTags {
364 fmt.Fprintf(w, "%v, ", h)
// String renders the haveTag for debugging in the form
// "tag:index:conf:maxRegion-maxScript|altScript".
369 func (t haveTag) String() string {
370 return fmt.Sprintf("%v:%d:%v:%v-%v|%v", t.tag, t.index, t.conf, t.maxRegion, t.maxScript, t.altScript)
// TestBestMatchAlloc measures allocations with testtext.AllocsPerRun and
// reports an error stating that zero allocations were expected.
373 func TestBestMatchAlloc(t *testing.T) {
// NOTE(review): makeTagList splits its argument on commas, so "en sr nl" is
// handed to mk as one string rather than three tags — confirm whether
// "en,sr,nl" was intended.
374 m := NewMatcher(makeTagList("en sr nl"))
375 // Go allocates when creating a list of tags from a single tag!
// The slice is therefore built once, before the measured function runs.
376 list := []Tag{English}
377 avg := testtext.AllocsPerRun(1, func() {
381 t.Errorf("got %f; want 0", avg)
// benchHave is the tag list from which the benchmarks in this file build their
// matcher via newMatcher(benchHave, nil).
385 var benchHave = []Tag{
// benchWant is a list of tag lists; BenchmarkMatch ranges over its entries on
// every benchmark iteration.
419 var benchWant = [][]Tag{
// BenchmarkMatch builds a matcher from benchHave once, before the b.N loop,
// and ranges over every entry of benchWant on each iteration.
450 func BenchmarkMatch(b *testing.B) {
451 m := newMatcher(benchHave, nil)
452 for i := 0; i < b.N; i++ {
453 for _, want := range benchWant {
// BenchmarkMatchExact builds a matcher from benchHave once, before the b.N
// loop, and then runs the benchmark body b.N times.
459 func BenchmarkMatchExact(b *testing.B) {
461 m := newMatcher(benchHave, nil)
462 for i := 0; i < b.N; i++ {
// BenchmarkMatchAltLanguagePresent builds a matcher from benchHave once, before
// the b.N loop, and then runs the benchmark body b.N times.
467 func BenchmarkMatchAltLanguagePresent(b *testing.B) {
469 m := newMatcher(benchHave, nil)
470 for i := 0; i < b.N; i++ {
// BenchmarkMatchAltLanguageNotPresent builds a matcher from benchHave once,
// before the b.N loop, and then runs the benchmark body b.N times.
475 func BenchmarkMatchAltLanguageNotPresent(b *testing.B) {
477 m := newMatcher(benchHave, nil)
478 for i := 0; i < b.N; i++ {
// BenchmarkMatchAltScriptPresent prepares the tag "zh-Hant-CN" and a matcher
// built from benchHave once, before the b.N loop, and then runs the benchmark
// body b.N times.
483 func BenchmarkMatchAltScriptPresent(b *testing.B) {
484 want := mk("zh-Hant-CN")
485 m := newMatcher(benchHave, nil)
486 for i := 0; i < b.N; i++ {
// BenchmarkMatchAltScriptNotPresent prepares the tag "fr-Cyrl" and a matcher
// built from benchHave once, before the b.N loop, and then runs the benchmark
// body b.N times.
491 func BenchmarkMatchAltScriptNotPresent(b *testing.B) {
492 want := mk("fr-Cyrl")
493 m := newMatcher(benchHave, nil)
494 for i := 0; i < b.N; i++ {
499 func BenchmarkMatchLimitedExact(b *testing.B) {
500 want := []Tag{mk("he-NL"), mk("iw-NL")}
501 m := newMatcher(benchHave, nil)
502 for i := 0; i < b.N; i++ {