1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
14 "golang.org/x/text/internal/testtext"
15 "golang.org/x/text/language"
16 "golang.org/x/text/unicode/norm"
19 func TestICUConformance(t *testing.T) {
37 "a\u00ad\u00ada", // Format
39 "a''a", // SingleQuote
47 "a__a", // ExtendNumlet
51 add := func(x interface{}) {
52 switch v := x.(type) {
54 input = append(input, v)
57 input = append(input, s)
61 for _, tc := range testCases {
67 for _, tc := range bufferTests {
70 for _, tc := range breakTest {
71 add(strings.Replace(tc, "|", "", -1))
73 for _, tc := range foldTestCases {
78 for _, c := range []string{"lower", "upper", "title", "fold"} {
79 for _, tag := range []string{
80 "und", "af", "az", "el", "lt", "nl", "tr",
82 for _, s := range input {
83 if exclude(c, tag, s) {
86 testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) {
87 want := doICU(tag, c, s)
88 got := doGo(tag, c, s)
89 if norm.NFC.String(got) != norm.NFC.String(want) {
90 t.Errorf("\n in %[3]q (%+[3]q)\n got %[1]q (%+[1]q)\n want %[2]q (%+[2]q)", got, want, s)
98 // exclude indicates if a string should be excluded from testing.
99 func exclude(cm, tag, s string) bool {
100 list := []struct{ cm, tags, pattern string }{
101 // TODO: Go does not handle certain esoteric breaks correctly. This will be
102 // fixed once we have a real word break iterator. Alternatively, it
103 // seems like we're not too far off from making it work, so we could
104 // fix these last steps. But first verify that using a separate word
105 // breaker does not hurt performance.
106 {"title", "af nl", "a''a"},
109 // All the exclusions below seem to be issues with the ICU
110 // implementation (at version 57) and thus are not marked as TODO.
112 // ICU does not handle leading apostrophe for Dutch and
113 // Afrikaans correctly. See http://unicode.org/cldr/trac/ticket/7078.
114 {"title", "af nl", "'n"},
115 {"title", "af nl", "'N"},
117 // Go terminates the final sigma check after a fixed number of
118 // ignorables have been found. This ensures that the algorithm can make
119 // progress in a streaming scenario.
120 {"lower title", "", "\u039f\u03a3...............................a"},
121 // This also applies to upper in Greek.
122 // NOTE: we could fix the following two cases by adding state to elUpper
123 // and aztrLower. However, considering a modifier to not belong to the
124 // preceding letter after the maximum modifiers count is reached is
125 // consistent with the behavior of unicode/norm.
126 {"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"},
127 {"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
128 {"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
129 {"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"},
131 // ICU title case seems to erroneously remove \u0307 from an upper case
132 // I unconditionally, instead of only when lowercasing. The ICU
133 // transform algorithm transforms these cases consistently with our
135 {"title", "az tr", "\u0307"},
137 // The spec says to remove \u0307 after Soft-Dotted characters. ICU
138 // transforms conform but ucasemap_utf8ToUpper does not.
139 {"upper title", "lt", "i\u0307"},
140 {"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"},
142 // Both Unicode and CLDR prescribe an extra explicit dot above after a
143 // Soft_Dotted character if there are other modifiers.
144 // ucasemap_utf8ToUpper does not do this; ICU transforms do.
145 // The issue with ucasemap_utf8ToUpper seems to be that it does not
146 // consider the modifiers that are part of composition in the evaluation
147 // of More_Above. For instance, according to the More_Above rule for lt,
148 // a dotted capital I (U+0130) becomes i\u0307\u0307 (a small i with
149 // two additional dots). This seems odd, but is correct. ICU is
150 // definitely not correct as it produces different results for different
151 // normal forms. For instance, for an İ:
152 // \u0130 (NFC) -> i\u0307 (incorrect)
153 // I\u0307 (NFD) -> i\u0307\u0307 (correct)
154 // We could argue that we should not add a \u0307 if there already is
155 // one, but this may be hard to get correct and does not conform to the
157 {"lower title", "lt", "\u0130"},
158 {"lower title", "lt", "\u00cf"},
160 // We conform to ICU ucasemap_utf8ToUpper if we remove support for
161 // elUpper. However, this clearly does not conform to the spec. Moreover, the
162 // ICU transforms _do_ implement this transform and produce results
163 // consistent with our implementation. Note that we still prefer to use
164 // ucasemap_utf8ToUpper instead of transforms as the latter have
165 // inconsistencies in the word breaking algorithm.
166 {"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS
167 {"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS
168 {"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS
170 {"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA
171 {"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA
172 {"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA
174 {"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS
175 {"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ETA WITH TONOS
176 {"upper", "el", "\u03AF"}, // GREEK SMALL LETTER IOTA WITH TONOS
178 {"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA
179 {"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA
180 {"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA
182 for _, x := range list {
183 if x.cm != "" && strings.Index(x.cm, cm) == -1 {
186 if x.tags != "" && strings.Index(x.tags, tag) == -1 {
189 if strings.Index(s, x.pattern) != -1 {
196 func doGo(tag, caser, input string) string {
198 t := language.MustParse(tag)
209 return c.String(input)