1 // Copyright 2015 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
13 "golang.org/x/text/internal/testtext"
14 "golang.org/x/text/secure/bidirule"
15 "golang.org/x/text/transform"
// testCase is a single test vector. The test drivers in this file read three
// fields from it: tc.input (the text handed to the profile), tc.output (the
// expected result) and tc.err (the expected error, compared with != / ==).
18 type testCase struct {
// enforceTestCases is the table of test vectors that doTests iterates over,
// group by group (it reads g.name and g.cases).
//
// Each group literal is written as {name, profile, cases} and each case as
// {input, expected output, expected error}.
// NOTE(review): the struct field declarations are not visible in this excerpt;
// the positional meaning above is inferred from how tc.input, tc.output and
// tc.err are used by the tests -- confirm against the declarations.
24 var enforceTestCases = []struct {
29 {"Basic", NewFreeform(), []testCase{
30 {"e\u0301\u031f", "\u00e9\u031f", nil}, // normalize: e + U+0301 is composed into U+00E9, U+031F is kept
// Context Rule 1: U+200C under the Freeform profile. The cases below expect
// errContext unless one of the two conditions quoted from the rule holds.
33 {"Context Rule 1", NewFreeform(), []testCase{
34 // Rule 1: zero-width non-joiner (U+200C)
37 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
38 // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
39 // (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
41 // Example runes for different joining types:
42 // Join L: U+A872; PHAGS-PA SUPERFIXED LETTER RA
43 // Join D: U+062C; HAH WITH DOT BELOW
44 // Join T: U+0610; ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM
45 // Join R: U+0627; ALEF
46 // Virama: U+0A4D; GURMUKHI SIGN VIRAMA
47 // Virama and Join T: U+0ACD; GUJARATI SIGN VIRAMA
// U+200C alone, next to plain ASCII, or with only one side of the joining
// pattern present: all rejected.
48 {"\u200c", "", errContext},
49 {"\u200ca", "", errContext},
50 {"a\u200c", "", errContext},
51 {"\u200c\u0627", "", errContext}, // missing JoinStart
52 {"\u062c\u200c", "", errContext}, // missing JoinEnd
53 {"\u0610\u200c\u0610\u0627", "", errContext}, // missing JoinStart
54 {"\u062c\u0610\u200c\u0610", "", errContext}, // missing JoinEnd
56 // Variants of: D T* U+200c T* R
57 {"\u062c\u200c\u0627", "\u062c\u200c\u0627", nil},
58 {"\u062c\u0610\u200c\u0610\u0627", "\u062c\u0610\u200c\u0610\u0627", nil},
59 {"\u062c\u0610\u0610\u200c\u0610\u0610\u0627", "\u062c\u0610\u0610\u200c\u0610\u0610\u0627", nil},
60 {"\u062c\u0610\u200c\u0627", "\u062c\u0610\u200c\u0627", nil},
61 {"\u062c\u200c\u0610\u0627", "\u062c\u200c\u0610\u0627", nil},
63 // Variants of: L T* U+200c T* D
64 {"\ua872\u200c\u062c", "\ua872\u200c\u062c", nil},
65 {"\ua872\u0610\u200c\u0610\u062c", "\ua872\u0610\u200c\u0610\u062c", nil},
66 {"\ua872\u0610\u0610\u200c\u0610\u0610\u062c", "\ua872\u0610\u0610\u200c\u0610\u0610\u062c", nil},
67 {"\ua872\u0610\u200c\u062c", "\ua872\u0610\u200c\u062c", nil},
68 {"\ua872\u200c\u0610\u062c", "\ua872\u200c\u0610\u062c", nil},
// U+0A4D (Virama) directly before U+200C is accepted; with U+0610 (Join T)
// between the two it is rejected.
71 {"\u0a4d\u200c", "\u0a4d\u200c", nil},
72 {"\ua872\u0a4d\u200c", "\ua872\u0a4d\u200c", nil},
73 {"\ua872\u0a4d\u0610\u200c", "", errContext},
74 {"\ua872\u0a4d\u0610\u200c", "", errContext}, // NOTE(review): identical to the previous case
// Same pattern with U+0ACD, listed above as both Virama and Join T.
76 {"\u0acd\u200c", "\u0acd\u200c", nil},
77 {"\ua872\u0acd\u200c", "\ua872\u0acd\u200c", nil},
78 {"\ua872\u0acd\u0610\u200c", "", errContext},
79 {"\ua872\u0acd\u0610\u200c", "", errContext}, // NOTE(review): identical to the previous case
81 // Using Virama as join T
82 {"\ua872\u0acd\u200c\u062c", "\ua872\u0acd\u200c\u062c", nil},
83 {"\ua872\u200c\u0acd\u062c", "\ua872\u200c\u0acd\u062c", nil},
// Context Rule 2: U+200D under the Freeform profile.
86 {"Context Rule 2", NewFreeform(), []testCase{
87 // Rule 2: zero-width joiner (U+200D)
// Alone or next to plain ASCII: rejected.
88 {"\u200d", "", errContext},
89 {"\u200da", "", errContext},
90 {"a\u200d", "", errContext},
// Accepted when U+0A4D immediately precedes it; rejected when another rune
// ('a') sits between the two.
92 {"\u0a4d\u200d", "\u0a4d\u200d", nil},
93 {"\ua872\u0a4d\u200d", "\ua872\u0a4d\u200d", nil},
94 {"\u0a4da\u200d", "", errContext},
// Context Rule 3: the middle dot under the Freeform profile. Of the inputs
// listed, only the dot placed between two 'l's is accepted.
97 {"Context Rule 3", NewFreeform(), []testCase{
// Dot alone, with an 'l' on one side only, or between other letters.
99 {"·", "", errContext},
100 {"l·", "", errContext},
101 {"·l", "", errContext},
102 {"a·", "", errContext},
103 {"l·a", "", errContext},
104 {"a·a", "", errContext},
// 'l' on both sides of the dot.
106 {"al·la", "al·la", nil},
// Context Rule 4: the Greek lower numeral sign under the Freeform profile.
// In these cases the sign is accepted only when the rune after it is Greek; a
// Greek rune before it does not make it valid.
109 {"Context Rule 4", NewFreeform(), []testCase{
110 // Rule 4: Greek lower numeral U+0375
111 {"͵", "", errContext},
112 {"͵a", "", errContext},
113 {"α͵", "", errContext},
116 {"͵͵α", "͵͵α", nil}, // The numeric sign is itself Greek.
117 {"α͵͵α", "α͵͵α", nil},
118 {"α͵͵", "", errContext}, // the last sign has nothing after it
119 {"α͵͵a", "", errContext}, // the last sign is followed by a Latin letter
// Context Rule 5+6: two Hebrew punctuation marks under the Freeform profile.
// In these cases a mark is accepted when it follows a Hebrew letter (also as a
// run of marks after one letter) and rejected at the start of the string or
// after a Latin letter.
122 {"Context Rule 5+6", NewFreeform(), []testCase{
123 // Rule 5+6: Hebrew preceding
// First mark (U+05f3 according to the comments on the passing cases).
125 {"׳", "", errContext},
126 {"׳ה", "", errContext},
127 {"a׳b", "", errContext},
128 {"ש׳", "ש׳", nil}, // U+05e9 U+05f3
129 {"ש׳׳׳", "ש׳׳׳", nil}, // U+05e9 U+05f3
// Second mark (U+05f4 according to the comments on the passing cases).
132 {"״", "", errContext},
133 {"״ה", "", errContext},
134 {"a״b", "", errContext},
135 {"ש״", "ש״", nil}, // U+05e9 U+05f4
136 {"ש״״״", "ש״״״", nil}, // U+05e9 U+05f4
137 {"aש״״״", "aש״״״", nil}, // U+05e9 U+05f4
// Context Rule 7: the Katakana middle dot under the Freeform profile. The dot
// is rejected in the Latin-only inputs and accepted once one of the runes ヅ,
// ぶ or ⺐ appears anywhere in the string, not necessarily next to the dot.
140 {"Context Rule 7", NewFreeform(), []testCase{
141 // Rule 7: Katakana middle Dot
142 {"・", "", errContext},
143 {"abc・", "", errContext},
144 {"・def", "", errContext},
145 {"abc・def", "", errContext},
146 {"aヅc・def", "aヅc・def", nil},
147 {"abc・dぶf", "abc・dぶf", nil},
148 {"⺐bc・def", "⺐bc・def", nil},
// Context Rule 8+9: two families of Arabic digits under the Freeform profile.
// A string that uses one family throughout passes unchanged; a string whose
// last digit comes from the other family is rejected.
151 {"Context Rule 8+9", NewFreeform(), []testCase{
152 // Rule 8+9: Arabic Indic Digit
153 {"١٢٣٤٥۶", "", errContext},
154 {"۱۲۳۴۵٦", "", errContext},
155 {"١٢٣٤٥", "١٢٣٤٥", nil},
156 {"۱۲۳۴۵", "۱۲۳۴۵", nil},
// Nickname profile: expectations for space handling, case, compatibility
// characters and the absence of a bidi check.
159 {"Nickname", Nickname, []testCase{
160 {" Swan of Avon ", "Swan of Avon", nil}, // leading and trailing spaces are removed
161 {"", "", errEmptyString},
162 {" ", "", errEmptyString}, // only spaces: nothing is left, so it counts as empty
163 {" ", "", errEmptyString},
// Each \u-escaped space rune in the input is expected to come out as a plain
// ASCII space.
164 {"a\u00A0a\u1680a\u2000a\u2001a\u2002a\u2003a\u2004a\u2005a\u2006a\u2007a\u2008a\u2009a\u200Aa\u202Fa\u205Fa\u3000a", "a a a a a a a a a a a a a a a a a", nil},
// Case is left untouched.
167 {"Foo Bar", "Foo Bar", nil},
168 {"foo bar", "foo bar", nil},
169 {"\u03A3", "\u03A3", nil},
170 {"\u03C3", "\u03C3", nil},
171 // Greek final sigma is left as is (do not fold!)
172 {"\u03C2", "\u03C2", nil},
173 {"\u265A", "♚", nil}, // a symbol is accepted as is
174 {"Richard \u2163", "Richard IV", nil}, // U+2163 is replaced by the ASCII letters "IV"
175 {"\u212B", "Å", nil},
176 {"\uFB00", "ff", nil}, // because of NFKC
177 {"שa", "שa", nil}, // no bidi rule
178 {"동일조건변경허락", "동일조건변경허락", nil},
// OpaqueString profile: unlike the Nickname cases, surrounding spaces and
// U+2163 are expected to be preserved; a tab is rejected.
180 {"OpaqueString", OpaqueString, []testCase{
181 {" Swan of Avon ", " Swan of Avon ", nil}, // surrounding spaces are kept
182 {"", "", errEmptyString},
// Each \u-escaped space rune in the input is expected to come out as a plain
// ASCII space.
185 {"a\u00A0a\u1680a\u2000a\u2001a\u2002a\u2003a\u2004a\u2005a\u2006a\u2007a\u2008a\u2009a\u200Aa\u202Fa\u205Fa\u3000a", "a a a a a a a a a a a a a a a a a", nil},
188 {"Foo Bar", "Foo Bar", nil},
189 {"foo bar", "foo bar", nil},
190 {"\u03C3", "\u03C3", nil},
191 {"Richard \u2163", "Richard \u2163", nil}, // U+2163 is kept, not expanded to "IV"
192 {"\u212B", "Å", nil},
193 {"Jack of \u2666s", "Jack of \u2666s", nil},
194 {"my cat is a \u0009by", "", errDisallowedRune}, // U+0009 (tab) is disallowed
195 {"שa", "שa", nil}, // no bidi rule
// UsernameCaseMapped profile: upper-case input is expected in lower case,
// spaces and various symbol/compatibility runes are rejected with
// errDisallowedRune, and mixed-direction text fails with bidirule.ErrInvalid.
197 {"UsernameCaseMapped", UsernameCaseMapped, []testCase{
198 // TODO: Should this work?
199 // {UsernameCaseMapped, "", "", errDisallowedRune},
200 {"juliet@example.com", "juliet@example.com", nil},
201 {"fussball", "fussball", nil},
202 {"fu\u00DFball", "fu\u00DFball", nil}, // U+00DF is kept, not expanded to "ss"
203 {"\u03C0", "\u03C0", nil},
204 {"\u03A3", "\u03C3", nil}, // capital sigma becomes small sigma
205 {"\u03C3", "\u03C3", nil},
206 // Greek final sigma is left as is (do not fold!)
207 {"\u03C2", "\u03C2", nil},
208 {"\u0049", "\u0069", nil},
209 {"\u0049", "\u0069", nil}, // NOTE(review): identical to the previous case
210 {"\u03D2", "", errDisallowedRune},
211 {"\u03B0", "\u03B0", nil},
212 {"foo bar", "", errDisallowedRune}, // a space is not allowed in a username
213 {"♚", "", errDisallowedRune},
214 {"\u007E", "~", nil},
217 {"²", "", errDisallowedRune},
218 {"\t", "", errDisallowedRune},
219 {"\n", "", errDisallowedRune},
220 {"\u26D6", "", errDisallowedRune},
221 {"\u26FF", "", errDisallowedRune},
222 {"\uFB00", "", errDisallowedRune},
223 {"\u1680", "", errDisallowedRune},
224 {" ", "", errDisallowedRune},
225 {" ", "", errDisallowedRune},
226 {"\u01C5", "", errDisallowedRune},
227 {"\u16EE", "", errDisallowedRune}, // Nl RUNIC ARLAUG SYMBOL
228 {"\u0488", "", errDisallowedRune}, // Me COMBINING CYRILLIC HUNDRED THOUSANDS SIGN
229 {"\u212B", "\u00e5", nil}, // Angstrom sign: expected result is U+00E5 (normalized and lower-cased)
230 {"A\u030A", "å", nil}, // A + ring
231 {"\u00C5", "å", nil}, // A with ring
232 {"\u00E7", "ç", nil}, // c cedille
233 {"\u0063\u0327", "ç", nil}, // c + cedille
234 {"\u0158", "ř", nil},
235 {"\u0052\u030C", "ř", nil},
237 {"\u1E61", "\u1E61", nil}, // LATIN SMALL LETTER S WITH DOT ABOVE
239 // Confusable characters ARE allowed and should NOT be mapped.
240 {"\u0410", "\u0430", nil}, // CYRILLIC CAPITAL LETTER A (lower-cased to U+0430, still Cyrillic)
242 // Full width should be mapped to the canonical decomposition.
244 {"שc", "", bidirule.ErrInvalid}, // bidi rule
// UsernameCasePreserved profile: same rejections as the case-mapped profile in
// the cases shown, but upper-case results stay upper case.
247 {"UsernameCasePreserved", UsernameCasePreserved, []testCase{
250 {"שc", "", bidirule.ErrInvalid}, // bidi rule
251 {"\uFB00", "", errDisallowedRune},
252 {"\u212B", "\u00c5", nil}, // Angstrom sign, NFC -> U+00C5 (case is preserved, so not U+00E5)
253 {"ẛ", "", errDisallowedRune}, // LATIN SMALL LETTER LONG S WITH DOT ABOVE
// doTests registers one subtest per case of every group in enforceTestCases.
// Subtests are started through testtext.Run and named
// "<group name>:<case index>:<quoted input>".
// NOTE(review): the subtest body is not visible in this excerpt; presumably it
// calls fn with the group's profile and tc -- confirm.
257 func doTests(t *testing.T, fn func(t *testing.T, p *Profile, tc testCase)) {
258 for _, g := range enforceTestCases {
259 for i, tc := range g.cases {
// %+q quotes the input and escapes non-ASCII runes, keeping the name ASCII-only.
260 name := fmt.Sprintf("%s:%d:%+q", g.name, i, tc.input)
261 testtext.Run(t, name, func(t *testing.T) {
// TestString runs the shared test cases through Profile.String. A case fails
// when either the returned error or the returned string differs from the
// expectation.
268 func TestString(t *testing.T) {
269 doTests(t, func(t *testing.T, p *Profile, tc testCase) {
// Errors are compared with !=, so the exact expected error value must come back.
270 if e, err := p.String(tc.input); tc.err != err || e != tc.output {
271 t.Errorf("got %+q (err: %v); want %+q (err: %v)", e, err, tc.output, tc.err)
// TestBytes runs the shared test cases through Profile.Bytes, then checks
// that Bytes does not hand back the caller's own slice when the input needs no
// change.
276 func TestBytes(t *testing.T) {
277 doTests(t, func(t *testing.T, p *Profile, tc testCase) {
278 if e, err := p.Bytes([]byte(tc.input)); tc.err != err || string(e) != tc.output {
279 t.Errorf("got %+q (err: %v); want %+q (err: %v)", string(e), err, tc.output, tc.err)
282 // Test that calling Bytes with something that doesn't transform returns a
284 orig := []byte("hello")
// The error is deliberately ignored; only the identity of the result matters here.
285 b, _ := NewFreeform().Bytes(orig)
// For a slice, Value.Pointer is the address of its first element, so equal
// pointers mean b aliases orig.
286 if reflect.ValueOf(b).Pointer() == reflect.ValueOf(orig).Pointer() {
287 t.Error("original and result are the same slice; should be a copy")
// TestAppend runs the shared test cases through Profile.Append with a nil
// destination and compares the returned bytes and error with the expectation.
291 func TestAppend(t *testing.T) {
292 doTests(t, func(t *testing.T, p *Profile, tc testCase) {
293 if e, err := p.Append(nil, []byte(tc.input)); tc.err != err || string(e) != tc.output {
294 t.Errorf("got %+q (err: %v); want %+q (err: %v)", string(e), err, tc.output, tc.err)
// TestStringMallocs measures the allocations of UsernameCaseMapped.String on
// an ASCII input over 100 runs. A non-zero count currently skips the test
// instead of failing it (see the TODO below).
299 func TestStringMallocs(t *testing.T) {
300 if n := testtext.AllocsPerRun(100, func() { UsernameCaseMapped.String("helloworld") }); n > 0 {
301 // TODO: reduce this to 0.
302 t.Skipf("got %f allocs, want 0", n)
// TestAppendMallocs requires UsernameCaseMapped.Append to perform zero
// allocations when the destination already has room for the input.
306 func TestAppendMallocs(t *testing.T) {
307 str := []byte("helloworld")
// Length 0, capacity len(str): Append has enough room without growing the slice.
308 out := make([]byte, 0, len(str))
309 if n := testtext.AllocsPerRun(100, func() { UsernameCaseMapped.Append(out, str) }); n > 0 {
310 t.Errorf("got %f allocs, want 0", n)
// TestTransformMallocs requires Transform, called on one reused transformer
// obtained from UsernameCaseMapped.NewTransformer, to perform zero allocations.
314 func TestTransformMallocs(t *testing.T) {
315 str := []byte("helloworld")
316 out := make([]byte, 0, len(str))
// The transformer is created once, outside the measured function.
317 tr := UsernameCaseMapped.NewTransformer()
318 if n := testtext.AllocsPerRun(100, func() {
// atEOF is true: str is the complete input.
320 tr.Transform(out, str, true)
322 t.Errorf("got %f allocs, want 0", n)
// min is used by TestTransformerShortBuffers to clamp chunk ends to the
// buffer length. The body is not visible in this excerpt; presumably it
// returns the smaller of a and b -- confirm.
// NOTE(review): since Go 1.21 there is a built-in min; this package-level
// function shadows it and could be dropped once older Go versions need no
// support.
326 func min(a, b int) int {
333 // TestTransformerShortBuffers tests that the precis.Transformer implements the
334 // spirit, not just the letter (the method signatures), of the
335 // transform.Transformer interface.
337 // In particular, it tests that, if one or both of the dst or src buffers are
338 // short, so that multiple Transform calls are required to complete the overall
339 // transformation, the end result is identical to one Transform call with
340 // sufficiently long buffers.
341 func TestTransformerShortBuffers(t *testing.T) {
342 srcUnit := []byte("a\u0300cce\u0301nts") // NFD normalization form.
343 wantUnit := []byte("àccénts") // NFC normalization form.
// Both units are repeated 16 times to build the full input and expected output.
344 src := bytes.Repeat(srcUnit, 16)
345 want := bytes.Repeat(wantUnit, 16)
347 dst := make([]byte, long)
349 // 5, 7, 9, 11, 13, 16 and 17 are all pair-wise co-prime, which means that
350 // slicing the dst and src buffers into 5, 7, 13 and 17 byte chunks will
351 // fall at different places inside the repeated srcUnit's and wantUnit's.
// (11 and 9 are the unit lengths asserted below; 16 is the repeat count.)
352 if len(srcUnit) != 11 || len(wantUnit) != 9 || len(src) > long || len(want) > long {
353 t.Fatal("inconsistent lengths")
356 tr := NewFreeform().NewTransformer()
// Every combination of dst and src chunk sizes is tried, including the
// full buffer length (long) on either side.
357 for _, deltaD := range []int{5, 7, 13, 17, long} {
359 for _, deltaS := range []int{5, 7, 13, 17, long} {
// d1 and s1 are the chunk ends, clamped to the buffer lengths.
364 d1 := min(len(dst), d0+deltaD)
365 s1 := min(len(src), s0+deltaS)
// The three-index slices cap capacity at the chunk end, so Transform
// cannot see past the chunk. atEOF is true only for the last src chunk.
366 nDst, nSrc, err := tr.Transform(dst[d0:d1:d1], src[s0:s1:s1], s1 == len(src))
// Short-buffer errors are the expected outcome of chunking; they are
// tested separately from the error reported just below.
372 if err == transform.ErrShortDst || err == transform.ErrShortSrc {
375 t.Errorf("deltaD=%d, deltaS=%d: %v", deltaD, deltaS, err)
// At the end, all of src must be consumed and exactly len(want) bytes produced.
379 t.Errorf("deltaD=%d, deltaS=%d: s0: got %d, want %d", deltaD, deltaS, s0, len(src))
383 t.Errorf("deltaD=%d, deltaS=%d: d0: got %d, want %d", deltaD, deltaS, d0, len(want))
387 if !bytes.Equal(got, want) {
388 t.Errorf("deltaD=%d, deltaS=%d:\ngot %q\nwant %q", deltaD, deltaS, got, want)