1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
22 "golang.org/x/text/collate/build"
23 "golang.org/x/text/internal/gen"
24 "golang.org/x/text/language"
// long is the -long command-line flag. TestCollation consults it to decide
// whether a test that may have to fetch data online is allowed to run.
27 var long = flag.Bool("long", false,
28 "run time-consuming tests, such as tests that fetch data online")
30 // This regression test runs tests for the test files in CollationTest.zip
31 // (taken from http://www.unicode.org/Public/UCA/<gen.UnicodeVersion()>/).
33 // The test files have the following form:
35 // 0009 0021; # ('\u0009') <CHARACTER TABULATION> [| | | 0201 025E]
36 // 0009 003F; # ('\u0009') <CHARACTER TABULATION> [| | | 0201 0263]
37 // 000A 0021; # ('\u000A') <LINE FEED (LF)> [| | | 0202 025E]
38 // 000A 003F; # ('\u000A') <LINE FEED (LF)> [| | | 0202 0263]
40 // The part before the semicolon is the hex representation of a sequence
41 // of runes. After the hash mark is a comment. The strings
42 // represented by the rune sequences appear in the file in sorted order, as
43 // defined by the DUCET (Default Unicode Collation Element Table).
// versionRe matches a "# UCA Version: <version>" header line; group 1 is the
// version text up to the end of the line.
51 var versionRe = regexp.MustCompile(`# UCA Version: (.*)\n?$`)
// testRe matches a test line. Group 1 is the run of upper-case hex digits and
// spaces before the semicolon (the code points); group 2 is the text that
// follows the last "# " on the line (the comment).
52 var testRe = regexp.MustCompile(`^([\dA-F ]+);.*# (.*)\n?$`)
// TestCollation is the entry point of the collation regression test. It
// iterates over the tests returned by loadTestData.
54 func TestCollation(t *testing.T) {
// Without a local data source and without -long, running would require a
// download, so the test is skipped instead.
55 if !gen.IsLocal() && !*long {
56 t.Skip("skipping test to prevent downloading; to run use -long or use -local to specify a local source")
// NOTE(review): this Skip looks unconditional, which would make the loop below
// unreachable until the new test file format is supported — confirm that it is
// not nested inside the guard above.
58 t.Skip("must first update to new file format to support test")
59 for _, test := range loadTestData() {
70 // parseUCA parses a Default Unicode Collation Element Table of the format
71 // specified in http://www.unicode.org/reports/tr10/#File_Format.
72 // Every parsed entry is handed to builder.Add; the function has no result.
// Malformed input and a version mismatch terminate the program via log.Fatalf.
73 func parseUCA(builder *build.Builder) {
// The table is read from allkeys.txt of the "UCA" data set.
74 r := gen.OpenUnicodeFile("UCA", "", "allkeys.txt")
76 input := bufio.NewReader(r)
// colelem matches one bracketed collation element: group 1 is the leading
// '.' or '*' marker, group 2 the dot-separated upper-case hex weights.
77 colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
// i counts input lines starting at 1 and is used to prefix error messages.
78 for i := 1; true; i++ {
79 l, prefix, err := input.ReadLine()
// ReadLine reports prefix when a line did not fit into the reader's buffer;
// such a line is treated as fatal rather than reassembled.
86 log.Fatalf("%d: buffer overflow", i)
// Empty lines and lines starting with '#' are singled out before any parsing.
88 if len(line) == 0 || line[0] == '#' {
// A "version" directive must declare exactly gen.UnicodeVersion().
// line[1:] drops the first character of the line (presumably the '@' that
// introduces a directive — confirm).
92 if strings.HasPrefix(line[1:], "version ") {
93 if v := strings.Split(line[1:], " ")[1]; v != gen.UnicodeVersion() {
94 log.Fatalf("incompatible version %s; want %s", v, gen.UnicodeVersion())
// A table entry is split at " ; " into the code points (part[0]) and the
// collation elements (part[1]).
99 part := strings.Split(line, " ; ")
101 log.Fatalf("%d: production rule without ';': %v", i, line)
// lhs collects the entry's code points, given as space-separated hex numbers.
104 for _, v := range strings.Split(part[0], " ") {
106 lhs = append(lhs, rune(convHex(i, v)))
// NOTE(review): from here on i is the index of the collation element and
// shadows the line counter, so the convHex call below reports the element
// index, not the line number, in its error message.
111 for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
// vars records collation element indexes (presumably those marked with '*',
// i.e. variable elements — confirm).
113 vars = append(vars, i)
// elem holds the weights of one collation element, in the order listed.
116 for _, h := range strings.Split(m[2], ".") {
117 elem = append(elem, convHex(i, h))
119 rhs = append(rhs, elem)
121 builder.Add(lhs, rhs, vars)
// convHex parses s as a hexadecimal number and returns its value.
//
// line is used only to prefix the error message. If s is not valid
// hexadecimal, or its value does not fit in a signed 32-bit integer, the
// program is terminated via log.Fatalf.
func convHex(line int, s string) int {
	v, err := strconv.ParseInt(s, 16, 32)
	if err != nil {
		log.Fatalf("%d: %v", line, err)
	}
	return int(v)
}
// loadTestData reads CollationTest.zip from the "UCA" data set and builds one
// Test per archive member that is neither a directory nor a "SHORT" variant.
// For each test line, the UTF-8 encoding of its code points goes into str and
// the text captured after "# " into comment, in file order.
// Unparsable test lines and scanner errors terminate the program.
134 func loadTestData() []Test {
135 f := gen.OpenUnicodeFile("UCA", "", "CollationTest.zip")
// The archive is read into memory in full: zip.NewReader needs random access
// to the data and its total size.
136 buffer, err := ioutil.ReadAll(f)
139 archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
142 for _, f := range archive.File {
143 // Skip the short versions, which are simply duplicates of the long versions.
144 if strings.Contains(f.Name, "SHORT") || f.FileInfo().IsDir() {
150 scanner := bufio.NewScanner(ff)
// The test is named after the base name of the archive member.
151 test := Test{name: path.Base(f.Name)}
153 line := scanner.Text()
// Lines of at most one character and '#' lines hold no test data. They are
// only checked for the UCA version header, and a mismatch is logged as a
// warning rather than treated as an error.
154 if len(line) <= 1 || line[0] == '#' {
155 if m := versionRe.FindStringSubmatch(line); m != nil {
156 if m[1] != gen.UnicodeVersion() {
157 log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], gen.UnicodeVersion())
// Every remaining line must match testRe with both capture groups.
162 m := testRe.FindStringSubmatch(line)
163 if m == nil || len(m) < 3 {
164 log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
167 // In the regression test data (unpaired) surrogates are assigned a weight
168 // corresponding to their code point value. However, utf8.DecodeRune,
169 // which is used to compute the implicit weight, assigns FFFD to surrogates.
170 // We therefore skip tests with surrogates. This skips about 35 entries
// m[1] is a space-separated list of hex code points. Each is appended to str
// in UTF-8; valid turns false once a value is not a valid rune (utf8.ValidRune
// rejects surrogates).
173 for _, split := range strings.Split(m[1], " ") {
174 r, err := strconv.ParseUint(split, 16, 64)
176 valid = valid && utf8.ValidRune(rune(r))
177 str = append(str, string(rune(r))...)
// NOTE(review): presumably only reached when valid is still true, in line
// with the surrogate remark above — confirm.
180 test.str = append(test.str, str)
181 test.comment = append(test.comment, m[2])
// A read error on the archive member is fatal.
184 if scanner.Err() != nil {
185 log.Fatal(scanner.Err())
187 tests = append(tests, test)
// runes decodes b as UTF-8 and returns the resulting code points. Each byte
// that does not start a valid UTF-8 sequence decodes to utf8.RuneError
// (U+FFFD). It is used to print byte strings as code points in test failures.
func runes(b []byte) []rune {
	// bytes.Runes decodes in place, without the intermediate string copy.
	return bytes.Runes(b)
}
// shifted is the language tag "und-u-ka-shifted-ks-level4". In the Unicode
// "u" locale extension, ka-shifted requests shifted handling of variable
// collation elements and ks-level4 requests strength level 4.
// NOTE(review): presumably the tag doTest passes to OptionsFromTag for test
// files whose name does not contain "NON_IGNOR" — confirm.
198 var shifted = language.MustParse("und-u-ka-shifted-ks-level4")
// doTest checks that the strings of tc are in non-decreasing collation order.
// Each string is compared with its predecessor in two ways: by comparing sort
// keys byte-wise, and by calling Compare in both argument orders. Every
// violation is reported through t.Errorf with the test name and the index.
200 func doTest(t *testing.T, tc Test) {
// A new collation table is built on every call.
201 bld := build.NewBuilder()
203 w, err := bld.Build()
// Test files are split by name: those without "NON_IGNOR" take this branch
// (presumably to select the shifted tag for the collator options — confirm).
206 if !strings.Contains(tc.name, "NON_IGNOR") {
209 c := NewFromTable(w, OptionsFromTag(tag))
// Start at 1 so that every string has a predecessor to compare against.
212 for i := 1; i < len(tc.str); i++ {
// ka and kb are presumably the sort keys of prev and s (confirm); a result of
// 1 means the earlier string's key sorts after the later one's.
217 if r := bytes.Compare(ka, kb); r == 1 {
218 t.Errorf("%s:%d: Key(%.4X) < Key(%.4X) (%X < %X) == %d; want -1 or 0", tc.name, i, []rune(string(prev)), []rune(string(s)), ka, kb, r)
// The same ordering must hold for direct comparison, ...
222 if r := c.Compare(prev, s); r == 1 {
223 t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want -1 or 0", tc.name, i, runes(prev), runes(s), r)
// ... and with the arguments swapped the result must not be -1.
225 if r := c.Compare(s, prev); r == -1 {
226 t.Errorf("%s:%d: Compare(%.4X, %.4X) == %d; want 1 or 0", tc.name, i, runes(s), runes(prev), r)