vendor/golang.org/x/text/language/match_test.go

   1 // Copyright 2013 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package language
   6
   7 import (
   8         "bytes"
   9         "flag"
  10         "fmt"
  11         "os"
  12         "path"
  13         "path/filepath"
  14         "strings"
  15         "testing"
  16
  17         "golang.org/x/text/internal/testtext"
  18         "golang.org/x/text/internal/ucd"
  19 )
  20
// verbose registers the boolean -verbose test flag (default false). Per its
// usage text it requests printing of the matchers' internal tables.
// NOTE(review): no reader of this flag is visible in this part of the file.
var verbose = flag.Bool("verbose", false, "set to true to print the internal tables of matchers")
  22
  23 func TestCompliance(t *testing.T) {
  24         filepath.Walk("testdata", func(file string, info os.FileInfo, err error) error {
  25                 if info.IsDir() {
  26                         return nil
  27                 }
  28                 r, err := os.Open(file)
  29                 if err != nil {
  30                         t.Fatal(err)
  31                 }
  32                 ucd.Parse(r, func(p *ucd.Parser) {
  33                         name := strings.Replace(path.Join(p.String(0), p.String(1)), " ", "", -1)
  34                         if skip[name] {
  35                                 return
  36                         }
  37                         t.Run(info.Name()+"/"+name, func(t *testing.T) {
  38                                 supported := makeTagList(p.String(0))
  39                                 desired := makeTagList(p.String(1))
  40                                 gotCombined, index, conf := NewMatcher(supported).Match(desired...)
  41
  42                                 gotMatch := supported[index]
  43                                 wantMatch := mk(p.String(2))
  44                                 if gotMatch != wantMatch {
  45                                         t.Fatalf("match: got %q; want %q (%v)", gotMatch, wantMatch, conf)
  46                                 }
  47                                 wantCombined, err := Raw.Parse(p.String(3))
  48                                 if err == nil && gotCombined != wantCombined {
  49                                         t.Errorf("combined: got %q; want %q (%v)", gotCombined, wantCombined, conf)
  50                                 }
  51                         })
  52                 })
  53                 return nil
  54         })
  55 }
  56
  57 var skip = map[string]bool{
  58         // TODO: bugs
  59         // Honor the wildcard match. This may only be useful to select non-exact
  60         // stuff.
  61         "mul,af/nl": true, // match: got "af"; want "mul"
  62
  63         // TODO: include other extensions.
  64         // combined: got "en-GB-u-ca-buddhist-nu-arab"; want "en-GB-fonipa-t-m0-iso-i0-pinyin-u-ca-buddhist-nu-arab"
  65         "und,en-GB-u-sd-gbsct/en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin": true,
  66
  67         // Inconsistencies with Mark Davis' implementation where it is not clear
  68         // which is better.
  69
  70         // Inconsistencies in combined. I think the Go approach is more appropriate.
  71         // We could use -u-rg- and -u-va- as alternative.
  72         "und,fr/fr-BE-fonipa":              true, // combined: got "fr"; want "fr-BE-fonipa"
  73         "und,fr-CA/fr-BE-fonipa":           true, // combined: got "fr-CA"; want "fr-BE-fonipa"
  74         "und,fr-fonupa/fr-BE-fonipa":       true, // combined: got "fr-fonupa"; want "fr-BE-fonipa"
  75         "und,no/nn-BE-fonipa":              true, // combined: got "no"; want "no-BE-fonipa"
  76         "50,und,fr-CA-fonupa/fr-BE-fonipa": true, // combined: got "fr-CA-fonupa"; want "fr-BE-fonipa"
  77
  78         // The initial number is a threshold. As we don't use scoring, we will not
  79         // implement this.
  80         "50,und,fr-Cyrl-CA-fonupa/fr-BE-fonipa": true,
  81         // match: got "und"; want "fr-Cyrl-CA-fonupa"
  82         // combined: got "und"; want "fr-Cyrl-BE-fonipa"
  83
  84         // Other interesting cases to test:
  85         // - Should same language or same script have the preference if there is
  86         //   usually no understanding of the other script?
  87         // - More specific region in desired may replace enclosing supported.
  88 }
  89
  90 func makeTagList(s string) (tags []Tag) {
  91         for _, s := range strings.Split(s, ",") {
  92                 tags = append(tags, mk(strings.TrimSpace(s)))
  93         }
  94         return tags
  95 }
  96
  97 func TestMatchStrings(t *testing.T) {
  98         testCases := []struct {
  99                 supported string
 100                 desired   string // strings separted by |
 101                 tag       string
 102                 index     int
 103         }{{
 104                 supported: "en",
 105                 desired:   "",
 106                 tag:       "en",
 107                 index:     0,
 108         }, {
 109                 supported: "en",
 110                 desired:   "nl",
 111                 tag:       "en",
 112                 index:     0,
 113         }, {
 114                 supported: "en,nl",
 115                 desired:   "nl",
 116                 tag:       "nl",
 117                 index:     1,
 118         }, {
 119                 supported: "en,nl",
 120                 desired:   "nl|en",
 121                 tag:       "nl",
 122                 index:     1,
 123         }, {
 124                 supported: "en-GB,nl",
 125                 desired:   "en ; q=0.1,nl",
 126                 tag:       "nl",
 127                 index:     1,
 128         }, {
 129                 supported: "en-GB,nl",
 130                 desired:   "en;q=0.005 | dk; q=0.1,nl ",
 131                 tag:       "en-GB",
 132                 index:     0,
 133         }, {
 134                 // do not match faulty tags with und
 135                 supported: "en,und",
 136                 desired:   "|en",
 137                 tag:       "en",
 138                 index:     0,
 139         }}
 140         for _, tc := range testCases {
 141                 t.Run(path.Join(tc.supported, tc.desired), func(t *testing.T) {
 142                         m := NewMatcher(makeTagList(tc.supported))
 143                         tag, index := MatchStrings(m, strings.Split(tc.desired, "|")...)
 144                         if tag.String() != tc.tag || index != tc.index {
 145                                 t.Errorf("got %v, %d; want %v, %d", tag, index, tc.tag, tc.index)
 146                         }
 147                 })
 148         }
 149 }
 150
 151 func TestAddLikelySubtags(t *testing.T) {
 152         tests := []struct{ in, out string }{
 153                 {"aa", "aa-Latn-ET"},
 154                 {"aa-Latn", "aa-Latn-ET"},
 155                 {"aa-Arab", "aa-Arab-ET"},
 156                 {"aa-Arab-ER", "aa-Arab-ER"},
 157                 {"kk", "kk-Cyrl-KZ"},
 158                 {"kk-CN", "kk-Arab-CN"},
 159                 {"cmn", "cmn"},
 160                 {"zh-AU", "zh-Hant-AU"},
 161                 {"zh-VN", "zh-Hant-VN"},
 162                 {"zh-SG", "zh-Hans-SG"},
 163                 {"zh-Hant", "zh-Hant-TW"},
 164                 {"zh-Hani", "zh-Hani-CN"},
 165                 {"und-Hani", "zh-Hani-CN"},
 166                 {"und", "en-Latn-US"},
 167                 {"und-GB", "en-Latn-GB"},
 168                 {"und-CW", "pap-Latn-CW"},
 169                 {"und-YT", "fr-Latn-YT"},
 170                 {"und-Arab", "ar-Arab-EG"},
 171                 {"und-AM", "hy-Armn-AM"},
 172                 {"und-TW", "zh-Hant-TW"},
 173                 {"und-002", "en-Latn-NG"},
 174                 {"und-Latn-002", "en-Latn-NG"},
 175                 {"en-Latn-002", "en-Latn-NG"},
 176                 {"en-002", "en-Latn-NG"},
 177                 {"en-001", "en-Latn-US"},
 178                 {"und-003", "en-Latn-US"},
 179                 {"und-GB", "en-Latn-GB"},
 180                 {"Latn-001", "en-Latn-US"},
 181                 {"en-001", "en-Latn-US"},
 182                 {"es-419", "es-Latn-419"},
 183                 {"he-145", "he-Hebr-IL"},
 184                 {"ky-145", "ky-Latn-TR"},
 185                 {"kk", "kk-Cyrl-KZ"},
 186                 // Don't specialize duplicate and ambiguous matches.
 187                 {"kk-034", "kk-Arab-034"}, // Matches IR and AF. Both are Arab.
 188                 {"ku-145", "ku-Latn-TR"},  // Matches IQ, TR, and LB, but kk -> TR.
 189                 {"und-Arab-CC", "ms-Arab-CC"},
 190                 {"und-Arab-GB", "ks-Arab-GB"},
 191                 {"und-Hans-CC", "zh-Hans-CC"},
 192                 {"und-CC", "en-Latn-CC"},
 193                 {"sr", "sr-Cyrl-RS"},
 194                 {"sr-151", "sr-Latn-151"}, // Matches RO and RU.
 195                 // We would like addLikelySubtags to generate the same results if the input
 196                 // only changes by adding tags that would otherwise have been added
 197                 // by the expansion.
 198                 // In other words:
 199                 //     und-AA -> xx-Scrp-AA   implies und-Scrp-AA -> xx-Scrp-AA
 200                 //     und-AA -> xx-Scrp-AA   implies xx-AA -> xx-Scrp-AA
 201                 //     und-Scrp -> xx-Scrp-AA implies und-Scrp-AA -> xx-Scrp-AA
 202                 //     und-Scrp -> xx-Scrp-AA implies xx-Scrp -> xx-Scrp-AA
 203                 //     xx -> xx-Scrp-AA       implies xx-Scrp -> xx-Scrp-AA
 204                 //     xx -> xx-Scrp-AA       implies xx-AA -> xx-Scrp-AA
 205                 //
 206                 // The algorithm specified in
 207                 //   http://unicode.org/reports/tr35/tr35-9.html#Supplemental_Data,
 208                 // Section C.10, does not handle the first case. For example,
 209                 // the CLDR data contains an entry und-BJ -> fr-Latn-BJ, but not
 210                 // there is no rule for und-Latn-BJ.  According to spec, und-Latn-BJ
 211                 // would expand to en-Latn-BJ, violating the aforementioned principle.
 212                 // We deviate from the spec by letting und-Scrp-AA expand to xx-Scrp-AA
 213                 // if a rule of the form und-AA -> xx-Scrp-AA is defined.
 214                 // Note that as of version 23, CLDR has some explicitly specified
 215                 // entries that do not conform to these rules. The implementation
 216                 // will not correct these explicit inconsistencies. A later versions of CLDR
 217                 // is supposed to fix this.
 218                 {"und-Latn-BJ", "fr-Latn-BJ"},
 219                 {"und-Bugi-ID", "bug-Bugi-ID"},
 220                 // regions, scripts and languages without definitions
 221                 {"und-Arab-AA", "ar-Arab-AA"},
 222                 {"und-Afak-RE", "fr-Afak-RE"},
 223                 {"und-Arab-GB", "ks-Arab-GB"},
 224                 {"abp-Arab-GB", "abp-Arab-GB"},
 225                 // script has preference over region
 226                 {"und-Arab-NL", "ar-Arab-NL"},
 227                 {"zza", "zza-Latn-TR"},
 228                 // preserve variants and extensions
 229                 {"de-1901", "de-Latn-DE-1901"},
 230                 {"de-x-abc", "de-Latn-DE-x-abc"},
 231                 {"de-1901-x-abc", "de-Latn-DE-1901-x-abc"},
 232                 {"x-abc", "x-abc"}, // TODO: is this the desired behavior?
 233         }
 234         for i, tt := range tests {
 235                 in, _ := Parse(tt.in)
 236                 out, _ := Parse(tt.out)
 237                 in, _ = in.addLikelySubtags()
 238                 if in.String() != out.String() {
 239                         t.Errorf("%d: add(%s) was %s; want %s", i, tt.in, in, tt.out)
 240                 }
 241         }
 242 }
 243 func TestMinimize(t *testing.T) {
 244         tests := []struct{ in, out string }{
 245                 {"aa", "aa"},
 246                 {"aa-Latn", "aa"},
 247                 {"aa-Latn-ET", "aa"},
 248                 {"aa-ET", "aa"},
 249                 {"aa-Arab", "aa-Arab"},
 250                 {"aa-Arab-ER", "aa-Arab-ER"},
 251                 {"aa-Arab-ET", "aa-Arab"},
 252                 {"und", "und"},
 253                 {"und-Latn", "und"},
 254                 {"und-Latn-US", "und"},
 255                 {"en-Latn-US", "en"},
 256                 {"cmn", "cmn"},
 257                 {"cmn-Hans", "cmn-Hans"},
 258                 {"cmn-Hant", "cmn-Hant"},
 259                 {"zh-AU", "zh-AU"},
 260                 {"zh-VN", "zh-VN"},
 261                 {"zh-SG", "zh-SG"},
 262                 {"zh-Hant", "zh-Hant"},
 263                 {"zh-Hant-TW", "zh-TW"},
 264                 {"zh-Hans", "zh"},
 265                 {"zh-Hani", "zh-Hani"},
 266                 {"und-Hans", "und-Hans"},
 267                 {"und-Hani", "und-Hani"},
 268
 269                 {"und-CW", "und-CW"},
 270                 {"und-YT", "und-YT"},
 271                 {"und-Arab", "und-Arab"},
 272                 {"und-AM", "und-AM"},
 273                 {"und-Arab-CC", "und-Arab-CC"},
 274                 {"und-CC", "und-CC"},
 275                 {"und-Latn-BJ", "und-BJ"},
 276                 {"und-Bugi-ID", "und-Bugi"},
 277                 {"bug-Bugi-ID", "bug-Bugi"},
 278                 // regions, scripts and languages without definitions
 279                 {"und-Arab-AA", "und-Arab-AA"},
 280                 // preserve variants and extensions
 281                 {"de-Latn-1901", "de-1901"},
 282                 {"de-Latn-x-abc", "de-x-abc"},
 283                 {"de-DE-1901-x-abc", "de-1901-x-abc"},
 284                 {"x-abc", "x-abc"}, // TODO: is this the desired behavior?
 285         }
 286         for i, tt := range tests {
 287                 in, _ := Parse(tt.in)
 288                 out, _ := Parse(tt.out)
 289                 min, _ := in.minimize()
 290                 if min.String() != out.String() {
 291                         t.Errorf("%d: min(%s) was %s; want %s", i, tt.in, min, tt.out)
 292                 }
 293                 max, _ := min.addLikelySubtags()
 294                 if x, _ := in.addLikelySubtags(); x.String() != max.String() {
 295                         t.Errorf("%d: max(min(%s)) = %s; want %s", i, tt.in, max, x)
 296                 }
 297         }
 298 }
 299
 300 func TestRegionGroups(t *testing.T) {
 301         testCases := []struct {
 302                 a, b     string
 303                 distance uint8
 304         }{
 305                 {"zh-TW", "zh-HK", 5},
 306                 {"zh-MO", "zh-HK", 4},
 307                 {"es-ES", "es-AR", 5},
 308                 {"es-ES", "es", 4},
 309                 {"es-419", "es-MX", 4},
 310                 {"es-AR", "es-MX", 4},
 311                 {"es-ES", "es-MX", 5},
 312                 {"es-PT", "es-MX", 5},
 313         }
 314         for _, tc := range testCases {
 315                 a := MustParse(tc.a)
 316                 aScript, _ := a.Script()
 317                 b := MustParse(tc.b)
 318                 bScript, _ := b.Script()
 319
 320                 if aScript != bScript {
 321                         t.Errorf("scripts differ: %q vs %q", aScript, bScript)
 322                         continue
 323                 }
 324                 d, _ := regionGroupDist(a.region, b.region, aScript.scriptID, a.lang)
 325                 if d != tc.distance {
 326                         t.Errorf("got %q; want %q", d, tc.distance)
 327                 }
 328         }
 329 }
 330
 331 func TestIsParadigmLocale(t *testing.T) {
 332         testCases := map[string]bool{
 333                 "en-US":  true,
 334                 "en-GB":  true,
 335                 "en-VI":  false,
 336                 "es-GB":  false,
 337                 "es-ES":  true,
 338                 "es-419": true,
 339         }
 340         for str, want := range testCases {
 341                 tag := Make(str)
 342                 got := isParadigmLocale(tag.lang, tag.region)
 343                 if got != want {
 344                         t.Errorf("isPL(%q) = %v; want %v", str, got, want)
 345                 }
 346         }
 347 }
 348
 349 // Implementation of String methods for various types for debugging purposes.
 350
 351 func (m *matcher) String() string {
 352         w := &bytes.Buffer{}
 353         fmt.Fprintln(w, "Default:", m.default_)
 354         for tag, h := range m.index {
 355                 fmt.Fprintf(w, "  %s: %v\n", tag, h)
 356         }
 357         return w.String()
 358 }
 359
 360 func (h *matchHeader) String() string {
 361         w := &bytes.Buffer{}
 362         fmt.Fprint(w, "haveTag: ")
 363         for _, h := range h.haveTags {
 364                 fmt.Fprintf(w, "%v, ", h)
 365         }
 366         return w.String()
 367 }
 368
 369 func (t haveTag) String() string {
 370         return fmt.Sprintf("%v:%d:%v:%v-%v|%v", t.tag, t.index, t.conf, t.maxRegion, t.maxScript, t.altScript)
 371 }
 372
 373 func TestBestMatchAlloc(t *testing.T) {
 374         m := NewMatcher(makeTagList("en sr nl"))
 375         // Go allocates when creating a list of tags from a single tag!
 376         list := []Tag{English}
 377         avg := testtext.AllocsPerRun(1, func() {
 378                 m.Match(list...)
 379         })
 380         if avg > 0 {
 381                 t.Errorf("got %f; want 0", avg)
 382         }
 383 }
 384
 385 var benchHave = []Tag{
 386         mk("en"),
 387         mk("en-GB"),
 388         mk("za"),
 389         mk("zh-Hant"),
 390         mk("zh-Hans-CN"),
 391         mk("zh"),
 392         mk("zh-HK"),
 393         mk("ar-MK"),
 394         mk("en-CA"),
 395         mk("fr-CA"),
 396         mk("fr-US"),
 397         mk("fr-CH"),
 398         mk("fr"),
 399         mk("lt"),
 400         mk("lv"),
 401         mk("iw"),
 402         mk("iw-NL"),
 403         mk("he"),
 404         mk("he-IT"),
 405         mk("tlh"),
 406         mk("ja"),
 407         mk("ja-Jpan"),
 408         mk("ja-Jpan-JP"),
 409         mk("de"),
 410         mk("de-CH"),
 411         mk("de-AT"),
 412         mk("de-DE"),
 413         mk("sr"),
 414         mk("sr-Latn"),
 415         mk("sr-Cyrl"),
 416         mk("sr-ME"),
 417 }
 418
 419 var benchWant = [][]Tag{
 420         []Tag{
 421                 mk("en"),
 422         },
 423         []Tag{
 424                 mk("en-AU"),
 425                 mk("de-HK"),
 426                 mk("nl"),
 427                 mk("fy"),
 428                 mk("lv"),
 429         },
 430         []Tag{
 431                 mk("en-AU"),
 432                 mk("de-HK"),
 433                 mk("nl"),
 434                 mk("fy"),
 435         },
 436         []Tag{
 437                 mk("ja-Hant"),
 438                 mk("da-HK"),
 439                 mk("nl"),
 440                 mk("zh-TW"),
 441         },
 442         []Tag{
 443                 mk("ja-Hant"),
 444                 mk("da-HK"),
 445                 mk("nl"),
 446                 mk("hr"),
 447         },
 448 }
 449
 450 func BenchmarkMatch(b *testing.B) {
 451         m := newMatcher(benchHave, nil)
 452         for i := 0; i < b.N; i++ {
 453                 for _, want := range benchWant {
 454                         m.getBest(want...)
 455                 }
 456         }
 457 }
 458
 459 func BenchmarkMatchExact(b *testing.B) {
 460         want := mk("en")
 461         m := newMatcher(benchHave, nil)
 462         for i := 0; i < b.N; i++ {
 463                 m.getBest(want)
 464         }
 465 }
 466
 467 func BenchmarkMatchAltLanguagePresent(b *testing.B) {
 468         want := mk("hr")
 469         m := newMatcher(benchHave, nil)
 470         for i := 0; i < b.N; i++ {
 471                 m.getBest(want)
 472         }
 473 }
 474
 475 func BenchmarkMatchAltLanguageNotPresent(b *testing.B) {
 476         want := mk("nn")
 477         m := newMatcher(benchHave, nil)
 478         for i := 0; i < b.N; i++ {
 479                 m.getBest(want)
 480         }
 481 }
 482
 483 func BenchmarkMatchAltScriptPresent(b *testing.B) {
 484         want := mk("zh-Hant-CN")
 485         m := newMatcher(benchHave, nil)
 486         for i := 0; i < b.N; i++ {
 487                 m.getBest(want)
 488         }
 489 }
 490
 491 func BenchmarkMatchAltScriptNotPresent(b *testing.B) {
 492         want := mk("fr-Cyrl")
 493         m := newMatcher(benchHave, nil)
 494         for i := 0; i < b.N; i++ {
 495                 m.getBest(want)
 496         }
 497 }
 498
 499 func BenchmarkMatchLimitedExact(b *testing.B) {
 500         want := []Tag{mk("he-NL"), mk("iw-NL")}
 501         m := newMatcher(benchHave, nil)
 502         for i := 0; i < b.N; i++ {
 503                 m.getBest(want...)
 504         }
 505 }