vendor/golang.org/x/text/cases/icu_test.go

   1 // Copyright 2016 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // +build icu
   6
   7 package cases
   8
   9 import (
  10         "path"
  11         "strings"
  12         "testing"
  13
  14         "golang.org/x/text/internal/testtext"
  15         "golang.org/x/text/language"
  16         "golang.org/x/text/unicode/norm"
  17 )
  18
  19 func TestICUConformance(t *testing.T) {
  20         // Build test set.
  21         input := []string{
  22                 "a.a a_a",
  23                 "a\u05d0a",
  24                 "\u05d0'a",
  25                 "a\u03084a",
  26                 "a\u0308a",
  27                 "a3\u30a3a",
  28                 "a\u303aa",
  29                 "a_\u303a_a",
  30                 "1_a..a",
  31                 "1_a.a",
  32                 "a..a.",
  33                 "a--a-",
  34                 "a-a-",
  35                 "a\u200ba",
  36                 "a\u200b\u200ba",
  37                 "a\u00ad\u00ada", // Format
  38                 "a\u00ada",
  39                 "a''a", // SingleQuote
  40                 "a'a",
  41                 "a::a", // MidLetter
  42                 "a:a",
  43                 "a..a", // MidNumLet
  44                 "a.a",
  45                 "a;;a", // MidNum
  46                 "a;a",
  47                 "a__a", // ExtendNumlet
  48                 "a_a",
  49                 "ΟΣ''a",
  50         }
  51         add := func(x interface{}) {
  52                 switch v := x.(type) {
  53                 case string:
  54                         input = append(input, v)
  55                 case []string:
  56                         for _, s := range v {
  57                                 input = append(input, s)
  58                         }
  59                 }
  60         }
  61         for _, tc := range testCases {
  62                 add(tc.src)
  63                 add(tc.lower)
  64                 add(tc.upper)
  65                 add(tc.title)
  66         }
  67         for _, tc := range bufferTests {
  68                 add(tc.src)
  69         }
  70         for _, tc := range breakTest {
  71                 add(strings.Replace(tc, "|", "", -1))
  72         }
  73         for _, tc := range foldTestCases {
  74                 add(tc)
  75         }
  76
  77         // Compare ICU to Go.
  78         for _, c := range []string{"lower", "upper", "title", "fold"} {
  79                 for _, tag := range []string{
  80                         "und", "af", "az", "el", "lt", "nl", "tr",
  81                 } {
  82                         for _, s := range input {
  83                                 if exclude(c, tag, s) {
  84                                         continue
  85                                 }
  86                                 testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) {
  87                                         want := doICU(tag, c, s)
  88                                         got := doGo(tag, c, s)
  89                                         if norm.NFC.String(got) != norm.NFC.String(want) {
  90                                                 t.Errorf("\n    in %[3]q (%+[3]q)\n   got %[1]q (%+[1]q)\n  want %[2]q (%+[2]q)", got, want, s)
  91                                         }
  92                                 })
  93                         }
  94                 }
  95         }
  96 }
  97
  98 // exclude indicates if a string should be excluded from testing.
  99 func exclude(cm, tag, s string) bool {
 100         list := []struct{ cm, tags, pattern string }{
 101                 // TODO: Go does not handle certain esoteric breaks correctly. This will be
 102                 // fixed once we have a real word break iterator. Alternatively, it
 103                 // seems like we're not too far off from making it work, so we could
 104                 // fix these last steps. But first verify that using a separate word
 105                 // breaker does not hurt performance.
 106                 {"title", "af nl", "a''a"},
 107                 {"", "", "א'a"},
 108
 109                 // All the exclusions below seem to be issues with the ICU
 110                 // implementation (at version 57) and thus are not marked as TODO.
 111
 112                 // ICU does not handle leading apostrophe for Dutch and
 113                 // Afrikaans correctly. See http://unicode.org/cldr/trac/ticket/7078.
 114                 {"title", "af nl", "'n"},
 115                 {"title", "af nl", "'N"},
 116
 117                 // Go terminates the final sigma check after a fixed number of
 118                 // ignorables have been found. This ensures that the algorithm can make
 119                 // progress in a streaming scenario.
 120                 {"lower title", "", "\u039f\u03a3...............................a"},
 121                 // This also applies to upper in Greek.
 122                 // NOTE: we could fix the following two cases by adding state to elUpper
 123                 // and aztrLower. However, considering a modifier to not belong to the
 124                 // preceding letter after the maximum modifiers count is reached is
 125                 // consistent with the behavior of unicode/norm.
 126                 {"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"},
 127                 {"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
 128                 {"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
 129                 {"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"},
 130
 131                 // ICU title case seems to erroneously removes \u0307 from an upper case
 132                 // I unconditionally, instead of only when lowercasing. The ICU
 133                 // transform algorithm transforms these cases consistently with our
 134                 // implementation.
 135                 {"title", "az tr", "\u0307"},
 136
 137                 // The spec says to remove \u0307 after Soft-Dotted characters. ICU
 138                 // transforms conform but ucasemap_utf8ToUpper does not.
 139                 {"upper title", "lt", "i\u0307"},
 140                 {"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"},
 141
 142                 // Both Unicode and CLDR prescribe an extra explicit dot above after a
 143                 // Soft_Dotted character if there are other modifiers.
 144                 // ucasemap_utf8ToUpper does not do this; ICU transforms do.
 145                 // The issue with ucasemap_utf8ToUpper seems to be that it does not
 146                 // consider the modifiers that are part of composition in the evaluation
 147                 // of More_Above. For instance, according to the More_Above rule for lt,
 148                 // a dotted capital I (U+0130) becomes i\u0307\u0307 (an small i with
 149                 // two additional dots). This seems odd, but is correct. ICU is
 150                 // definitely not correct as it produces different results for different
 151                 // normal forms. For instance, for an İ:
 152                 //    \u0130  (NFC) -> i\u0307         (incorrect)
 153                 //    I\u0307 (NFD) -> i\u0307\u0307   (correct)
 154                 // We could argue that we should not add a \u0307 if there already is
 155                 // one, but this may be hard to get correct and is not conform the
 156                 // standard.
 157                 {"lower title", "lt", "\u0130"},
 158                 {"lower title", "lt", "\u00cf"},
 159
 160                 // We are conform ICU ucasemap_utf8ToUpper if we remove support for
 161                 // elUpper. However, this is clearly not conform the spec. Moreover, the
 162                 // ICU transforms _do_ implement this transform and produces results
 163                 // consistent with our implementation. Note that we still prefer to use
 164                 // ucasemap_utf8ToUpper instead of transforms as the latter have
 165                 // inconsistencies in the word breaking algorithm.
 166                 {"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS
 167                 {"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS
 168                 {"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS
 169
 170                 {"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA
 171                 {"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA
 172                 {"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA
 173
 174                 {"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS
 175                 {"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ALPHA WITH ETA
 176                 {"upper", "el", "\u03AF"}, // GREEK SMALL LETTER ALPHA WITH IOTA
 177
 178                 {"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA
 179                 {"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA
 180                 {"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA
 181         }
 182         for _, x := range list {
 183                 if x.cm != "" && strings.Index(x.cm, cm) == -1 {
 184                         continue
 185                 }
 186                 if x.tags != "" && strings.Index(x.tags, tag) == -1 {
 187                         continue
 188                 }
 189                 if strings.Index(s, x.pattern) != -1 {
 190                         return true
 191                 }
 192         }
 193         return false
 194 }
 195
 196 func doGo(tag, caser, input string) string {
 197         var c Caser
 198         t := language.MustParse(tag)
 199         switch caser {
 200         case "lower":
 201                 c = Lower(t)
 202         case "upper":
 203                 c = Upper(t)
 204         case "title":
 205                 c = Title(t)
 206         case "fold":
 207                 c = Fold()
 208         }
 209         return c.String(input)
 210 }