vendor/golang.org/x/text/unicode/norm/normalize_test.go

   1 // Copyright 2011 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package norm
   6
   7 import (
   8         "bytes"
   9         "flag"
  10         "fmt"
  11         "io"
  12         "log"
  13         "strings"
  14         "testing"
  15         "unicode/utf8"
  16
  17         "golang.org/x/text/internal/testtext"
  18         "golang.org/x/text/transform"
  19 )
  20
  21 var (
  22         testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
  23 )
  24
  25 // pc replaces any rune r that is repeated n times, for n > 1, with r{n}.
  26 func pc(s string) []byte {
  27         b := bytes.NewBuffer(make([]byte, 0, len(s)))
  28         for i := 0; i < len(s); {
  29                 r, sz := utf8.DecodeRuneInString(s[i:])
  30                 n := 0
  31                 if sz == 1 {
  32                         // Special-case one-byte case to handle repetition for invalid UTF-8.
  33                         for c := s[i]; i+n < len(s) && s[i+n] == c; n++ {
  34                         }
  35                 } else {
  36                         for _, r2 := range s[i:] {
  37                                 if r2 != r {
  38                                         break
  39                                 }
  40                                 n++
  41                         }
  42                 }
  43                 b.WriteString(s[i : i+sz])
  44                 if n > 1 {
  45                         fmt.Fprintf(b, "{%d}", n)
  46                 }
  47                 i += sz * n
  48         }
  49         return b.Bytes()
  50 }
  51
  52 // pidx finds the index from which two strings start to differ, plus context.
  53 // It returns the index and ellipsis if the index is greater than 0.
  54 func pidx(a, b string) (i int, prefix string) {
  55         for ; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
  56         }
  57         if i < 8 {
  58                 return 0, ""
  59         }
  60         i -= 3 // ensure taking at least one full rune before the difference.
  61         for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
  62         }
  63         return i, "..."
  64 }
  65
// PositionTest is one entry of the position-based test tables in this file.
// runPosTests passes input to a positionFunc and compares the two values it
// returns against pos and buffer. The tables use positional literals, so the
// field order must not change.
type PositionTest struct {
	input  string // text handed to the function under test
	pos    int    // expected int result of the function under test
	buffer string // expected contents of reorderBuffer, if applicable
}
  71
// positionFunc adapts a function under test to runPosTests. It receives the
// reorderBuffer prepared by runPosTests together with the test input, and
// returns the values that are compared against PositionTest.pos and
// PositionTest.buffer.
type positionFunc func(rb *reorderBuffer, s string) (int, []byte)
  73
  74 func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
  75         rb := reorderBuffer{}
  76         rb.init(f, nil)
  77         for i, test := range tests {
  78                 rb.reset()
  79                 rb.src = inputString(test.input)
  80                 rb.nsrc = len(test.input)
  81                 pos, out := fn(&rb, test.input)
  82                 if pos != test.pos {
  83                         t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
  84                 }
  85                 if outs := string(out); outs != test.buffer {
  86                         k, pfx := pidx(outs, test.buffer)
  87                         t.Errorf("%s:%d: buffer \nwas  %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
  88                 }
  89         }
  90 }
  91
  92 func grave(n int) string {
  93         return rep(0x0300, n)
  94 }
  95
  96 func rep(r rune, n int) string {
  97         return strings.Repeat(string(r), n)
  98 }
  99
// segSize is a test-local alias for maxByteBufferSize.
const segSize = maxByteBufferSize
 101
// cgj is a short alias for GraphemeJoiner. The expected outputs in the tables
// below concatenate it at the point where an over-long run of combining
// characters is expected to be split.
var cgj = GraphemeJoiner
 103
// decomposeSegmentTests is the table for TestDecomposeSegment. For each input,
// pos is the position expected back from decomposeSegmentF and buffer is the
// expected output.
var decomposeSegmentTests = []PositionTest{
	// illegal runes
	{"\xC2", 0, ""},
	{"\xC0", 1, "\xC0"},
	{"\u00E0\x80", 2, "\u0061\u0300"},
	// starter
	{"a", 1, "a"},
	{"ab", 1, "a"},
	// starter + composing
	{"a\u0300", 3, "a\u0300"},
	{"a\u0300b", 3, "a\u0300"},
	// with decomposition
	{"\u00C0", 2, "A\u0300"},
	{"\u00C0b", 2, "A\u0300"},
	// long
	{grave(31), 60, grave(30) + cgj},
	{"a" + grave(31), 61, "a" + grave(30) + cgj},

	// Stability tests: see http://www.unicode.org/review/pr-29.html.
	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
	{"\u1100\u1161", 6, "\u1100\u1161"},

	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	// Sequence of decomposing characters that are starters and modifiers.
	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},

	{grave(30), 60, grave(30)},
	// U+FF9E is a starter, but decomposes to U+3099, which is not.
	{grave(30) + "\uff9e", 60, grave(30) + cgj},
	// ends with incomplete UTF-8 encoding
	{"\xCC", 0, ""},
	{"\u0300\xCC", 2, "\u0300"},
}
 144
 145 func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
 146         rb.initString(NFD, s)
 147         rb.setFlusher(nil, appendFlush)
 148         p := decomposeSegment(rb, 0, true)
 149         return p, rb.out
 150 }
 151
 152 func TestDecomposeSegment(t *testing.T) {
 153         runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
 154 }
 155
// firstBoundaryTests is the table for TestFirstBoundary. pos is the expected
// result of FirstBoundary / FirstBoundaryInString; the table uses -1 where no
// boundary is expected. buffer is always empty because the adapters return a
// nil buffer.
var firstBoundaryTests = []PositionTest{
	// no boundary
	{"", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	// illegal runes
	{"\xff", 0, ""},
	{"\u0300\xff", 2, ""},
	{"\u0300\xc0\x80\x80", 2, ""},
	// boundaries
	{"a", 0, ""},
	{"\u0300a", 2, ""},
	// Hangul
	{"\u1103\u1161", 0, ""},
	{"\u110B\u1173\u11B7", 0, ""},
	{"\u1161\u110B\u1173\u11B7", 3, ""},
	{"\u1173\u11B7\u1103\u1161", 6, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	{grave(maxNonStarters), 60, ""},
	{grave(maxNonStarters + 1), 60, ""},
}
 178
 179 func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
 180         return rb.f.form.FirstBoundary([]byte(s)), nil
 181 }
 182
 183 func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
 184         return rb.f.form.FirstBoundaryInString(s), nil
 185 }
 186
 187 func TestFirstBoundary(t *testing.T) {
 188         runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
 189         runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
 190 }
 191
 192 func TestNextBoundary(t *testing.T) {
 193         testCases := []struct {
 194                 input string
 195                 atEOF bool
 196                 want  int
 197         }{
 198                 // no boundary
 199                 {"", true, 0},
 200                 {"", false, -1},
 201                 {"\u0300", true, 2},
 202                 {"\u0300", false, -1},
 203                 {"\x80\x80", true, 1},
 204                 {"\x80\x80", false, 1},
 205                 // illegal runes
 206                 {"\xff", false, 1},
 207                 {"\u0300\xff", false, 2},
 208                 {"\u0300\xc0\x80\x80", false, 2},
 209                 {"\xc2\x80\x80", false, 2},
 210                 {"\xc2", false, -1},
 211                 {"\xc2", true, 1},
 212                 {"a\u0300\xc2", false, -1},
 213                 {"a\u0300\xc2", true, 3},
 214                 // boundaries
 215                 {"a", true, 1},
 216                 {"a", false, -1},
 217                 {"aa", false, 1},
 218                 {"\u0300", true, 2},
 219                 {"\u0300", false, -1},
 220                 {"\u0300a", false, 2},
 221                 // Hangul
 222                 {"\u1103\u1161", true, 6},
 223                 {"\u1103\u1161", false, -1},
 224                 {"\u110B\u1173\u11B7", false, -1},
 225                 {"\u110B\u1173\u11B7\u110B\u1173\u11B7", false, 9},
 226                 {"\u1161\u110B\u1173\u11B7", false, 3},
 227                 {"\u1173\u11B7\u1103\u1161", false, 6},
 228                 // too many combining characters.
 229                 {grave(maxNonStarters - 1), false, -1},
 230                 {grave(maxNonStarters), false, 60},
 231                 {grave(maxNonStarters + 1), false, 60},
 232         }
 233
 234         for _, tc := range testCases {
 235                 if got := NFC.NextBoundary([]byte(tc.input), tc.atEOF); got != tc.want {
 236                         t.Errorf("NextBoundary(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
 237                 }
 238                 if got := NFC.NextBoundaryInString(tc.input, tc.atEOF); got != tc.want {
 239                         t.Errorf("NextBoundaryInString(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
 240                 }
 241         }
 242 }
 243
// decomposeToLastTests is the table for TestDecomposeToLastBoundary, which
// runs it with NFKC. For each input, pos is the expected len(rb.out) and
// buffer the expected result of rb.flush(nil), both as returned by
// decomposeToLast.
var decomposeToLastTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	{"a\u0301\u0635", 5, ""},
	// ends with non-inert starter
	{"a", 0, "a"},
	{"a\u0301a", 3, "a"},
	{"a\u0301\u03B9", 3, "\u03B9"},
	{"a\u0327", 0, "a\u0327"},
	// illegal runes
	{"\xFF", 1, ""},
	{"aa\xFF", 3, ""},
	{"\xC0\x80\x80", 3, ""},
	{"\xCC\x80\x80", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"a\xCC", 2, ""},
	// ends with combining characters
	{"\u0300\u0301", 0, "\u0300\u0301"},
	{"a\u0300\u0301", 0, "a\u0300\u0301"},
	{"a\u0301\u0308", 0, "a\u0301\u0308"},
	{"a\u0308\u0301", 0, "a\u0308\u0301"},
	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
	{"\u00C0", 0, "A\u0300"},
	{"a\u00C0", 1, "A\u0300"},
	// decomposing
	{"a\u0300\u00E0", 3, "a\u0300"},
	// multisegment decompositions (flushes leading segments)
	{"a\u0300\uFDC0", 7, "\u064A"},
	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
	{"\uFDC0" + grave(31), 5, grave(30)},
	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
	// Overflow
	{"\u00E0" + grave(29), 0, "a" + grave(30)},
	{"\u00E0" + grave(30), 2, grave(30)},
	// Hangul
	{"a\u1103", 1, "\u1103"},
	{"a\u110B", 1, "\u110B"},
	{"a\u110B\u1173", 1, "\u110B\u1173"},
	// See comment in composition.go:compBoundaryAfter.
	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
	{"a\uC73C", 1, "\u110B\u1173"},
	{"다음", 3, "\u110B\u1173\u11B7"},
	{"다", 0, "\u1103\u1161"},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
	{"다음음", 6, "\u110B\u1173\u11B7"},
	{"음다다", 6, "\u1103\u1161"},
	// maximized buffer
	{"a" + grave(30), 0, "a" + grave(30)},
	// Buffer overflow
	{"a" + grave(31), 3, grave(30)},
	// weird UTF-8
	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
}
 301
 302 func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
 303         rb.setFlusher([]byte(s), appendFlush)
 304         decomposeToLastBoundary(rb)
 305         buf := rb.flush(nil)
 306         return len(rb.out), buf
 307 }
 308
 309 func TestDecomposeToLastBoundary(t *testing.T) {
 310         runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
 311 }
 312
// lastBoundaryTests is the table for TestLastBoundary, which runs it with NFC.
// pos is the expected result of LastBoundary; the table uses -1 where no
// boundary is expected. buffer is always empty because lastBoundaryF returns a
// nil buffer.
var lastBoundaryTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	// ends with non-inert starter
	{"a", 0, ""},
	// illegal runes
	{"\xff", 1, ""},
	{"aa\xff", 3, ""},
	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
	{"\xc0\x80\x80", 3, ""},
	{"\xc0\x80\x80\u0300", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"\xCC", -1, ""},
	{"\xE0\x80", -1, ""},
	{"\xF0\x80\x80", -1, ""},
	{"a\xCC", 0, ""},
	{"\x80\xCC", 1, ""},
	{"\xCC\xCC", 1, ""},
	// ends with combining characters
	{"a\u0300\u0301", 0, ""},
	{"aaaa\u0300\u0301", 3, ""},
	{"\u0300a\u0300\u0301", 2, ""},
	{"\u00C2", 0, ""},
	{"a\u00C2", 1, ""},
	// decomposition may recombine
	{"\u0226", 0, ""},
	// no boundary
	{"", -1, ""},
	{"\u0300\u0301", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	{"\x80\x80\u0301", -1, ""},
	// Hangul
	{"다음", 3, ""},
	{"다", 0, ""},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	// May still be preceded with a non-starter.
	{grave(maxNonStarters), -1, ""},
	// May still need to insert a cgj after the last combiner.
	{grave(maxNonStarters + 1), 2, ""},
	{grave(maxNonStarters + 2), 4, ""},

	{"a" + grave(maxNonStarters-1), 0, ""},
	{"a" + grave(maxNonStarters), 0, ""},
	// May still need to insert a cgj after the last combiner.
	{"a" + grave(maxNonStarters+1), 3, ""},
	{"a" + grave(maxNonStarters+2), 5, ""},
}
 365
 366 func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
 367         return rb.f.form.LastBoundary([]byte(s)), nil
 368 }
 369
 370 func TestLastBoundary(t *testing.T) {
 371         runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
 372 }
 373
// spanTest is one entry of the Span/SpanString test tables. runSpanTests
// passes input and atEOF to both methods and expects them to return n and err.
// The tables use positional literals, so the field order must not change.
type spanTest struct {
	input string // text to scan
	atEOF bool   // atEOF argument passed to Span and SpanString
	n     int    // expected int result
	err   error  // expected error result (compared with ==)
}
 380
// quickSpanTests holds the span cases that TestSpan runs for both NFD and NFC.
var quickSpanTests = []spanTest{
	{"", true, 0, nil},
	// starters
	{"a", true, 1, nil},
	{"abc", true, 3, nil},
	{"\u043Eb", true, 3, nil},
	// incomplete last rune.
	{"\xCC", true, 1, nil},
	{"\xCC", false, 0, transform.ErrShortSrc},
	{"a\xCC", true, 2, nil},
	{"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD
	// incorrectly ordered combining characters
	{"\u0300\u0316", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316", false, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan},
	// have a maximum number of combining characters.
	{rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan},
	{rep(0x035D, 30) + cgj + "\u035B", true, 64, nil},
	{"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},

	{"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
}
 411
// quickSpanNFDTests holds the span cases that TestSpan runs for NFD only.
var quickSpanNFDTests = []spanTest{
	// needs decomposing
	{"\u00C0", true, 0, transform.ErrEndOfSpan},
	{"abc\u00C0", true, 3, transform.ErrEndOfSpan},
	// correctly ordered combining characters
	{"\u0300", true, 2, nil},
	{"ab\u0300", true, 4, nil},
	{"ab\u0300cd", true, 6, nil},
	{"\u0300cd", true, 4, nil},
	{"\u0316\u0300", true, 4, nil},
	{"ab\u0316\u0300", true, 6, nil},
	{"ab\u0316\u0300cd", true, 8, nil},
	{"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 6, nil},
	{"\u043E\u0308b", true, 5, nil},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well.
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 0, transform.ErrEndOfSpan},
}
 433
// quickSpanNFCTests holds the span cases that TestSpan runs for NFC only.
var quickSpanNFCTests = []spanTest{
	// okay composed
	{"\u00C0", true, 2, nil},
	{"abc\u00C0", true, 5, nil},
	// correctly ordered combining characters
	// TODO: b may combine with modifiers, which is why this fails. We could
	// make a more precise test that actually checks whether the last
	// character combines. Probably not worth it.
	{"ab\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"\u00C0\u035D", true, 4, nil},
	// we do not special case leading combining characters
	{"\u0300cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 6, nil},
	{"같은", false, 3, transform.ErrShortSrc},
	// We return the start of the violating segment in case of overflow.
	{grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan},
	{grave(30), true, 0, transform.ErrEndOfSpan},
}
 462
 463 func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) {
 464         for i, tc := range testCases {
 465                 s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
 466                 ok := testtext.Run(t, s, func(t *testing.T) {
 467                         n, err := f.Span([]byte(tc.input), tc.atEOF)
 468                         if n != tc.n || err != tc.err {
 469                                 t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
 470                         }
 471                 })
 472                 if !ok {
 473                         continue // Don't do the String variant if the Bytes variant failed.
 474                 }
 475                 s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
 476                 testtext.Run(t, s, func(t *testing.T) {
 477                         n, err := f.SpanString(tc.input, tc.atEOF)
 478                         if n != tc.n || err != tc.err {
 479                                 t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
 480                         }
 481                 })
 482         }
 483 }
 484
 485 func TestSpan(t *testing.T) {
 486         runSpanTests(t, "NFD", NFD, quickSpanTests)
 487         runSpanTests(t, "NFD", NFD, quickSpanNFDTests)
 488         runSpanTests(t, "NFC", NFC, quickSpanTests)
 489         runSpanTests(t, "NFC", NFC, quickSpanNFCTests)
 490 }
 491
// isNormalTests holds the IsNormal cases that TestIsNormal runs for all four
// forms. pos is 1 when the input is expected to be reported as normalized and
// 0 otherwise (see isNormalF and isNormalStringF); buffer is unused.
var isNormalTests = []PositionTest{
	{"", 1, ""},
	// illegal runes
	{"\xff", 1, ""},
	// starters
	{"a", 1, ""},
	{"abc", 1, ""},
	{"\u043Eb", 1, ""},
	// incorrectly ordered combining characters
	{"\u0300\u0316", 0, ""},
	{"ab\u0300\u0316", 0, ""},
	{"ab\u0300\u0316cd", 0, ""},
	{"\u0300\u0316cd", 0, ""},
}
// isNormalNFDTests holds the IsNormal cases that TestIsNormal runs for NFD and
// NFKD. pos is 1 for "normalized" and 0 otherwise; buffer is unused.
var isNormalNFDTests = []PositionTest{
	// needs decomposing
	{"\u00C0", 0, ""},
	{"abc\u00C0", 0, ""},
	// correctly ordered combining characters
	{"\u0300", 1, ""},
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"\u0300cd", 1, ""},
	{"\u0316\u0300", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	{"\u043E\u0308b", 1, ""},
	// Hangul
	{"같은", 0, ""},
}
// isNormalNFCTests holds the IsNormal cases that TestIsNormal runs for NFC and
// NFKC. pos is 1 for "normalized" and 0 otherwise; buffer is unused.
var isNormalNFCTests = []PositionTest{
	// okay composed
	{"\u00C0", 1, ""},
	{"abc\u00C0", 1, ""},
	// need reordering
	{"a\u0300", 0, ""},
	{"a\u0300cd", 0, ""},
	{"a\u0316\u0300", 0, ""},
	{"a\u0316\u0300cd", 0, ""},
	// correctly ordered combining characters
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u00C0\u035D", 1, ""},
	{"\u0300", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	// Hangul
	{"같은", 1, ""},
}
 543
// isNormalNFKXTests holds the IsNormal cases that TestIsNormal runs for the
// compatibility forms NFKD and NFKC only. pos is 1 for "normalized" and 0
// otherwise; buffer is unused.
var isNormalNFKXTests = []PositionTest{
	// Special case.
	{"\u00BC", 0, ""},
}
 548
 549 func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
 550         if rb.f.form.IsNormal([]byte(s)) {
 551                 return 1, nil
 552         }
 553         return 0, nil
 554 }
 555
 556 func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
 557         if rb.f.form.IsNormalString(s) {
 558                 return 1, nil
 559         }
 560         return 0, nil
 561 }
 562
 563 func TestIsNormal(t *testing.T) {
 564         runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
 565         runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
 566         runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
 567         runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
 568         runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
 569         runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
 570         runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
 571         runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
 572         runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
 573         runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
 574 }
 575
 576 func TestIsNormalString(t *testing.T) {
 577         runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
 578         runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
 579         runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
 580         runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
 581 }
 582
// AppendTest is one entry of the append test tables. runAppendTests seeds the
// output with left, appends right through the appendFunc under test, and
// compares the result with out. The tables use positional literals, so the
// field order must not change.
type AppendTest struct {
	left  string // bytes already present in the output
	right string // text appended by the function under test
	out   string // expected final output
}
 588
// appendFunc adapts an append-style operation to runAppendTests. It is called
// with a form, the existing output, and the string to append, and returns the
// resulting bytes.
type appendFunc func(f Form, out []byte, s string) []byte
 590
// fstr maps a Form, used as an index, to the name shown in subtest names.
// NOTE(review): the order presumably mirrors the Form constants NFC, NFD,
// NFKC, NFKD — confirm against the Form declaration.
var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
 592
 593 func runNormTests(t *testing.T, name string, fn appendFunc) {
 594         for f := NFC; f <= NFKD; f++ {
 595                 runAppendTests(t, name, f, fn, normTests[f])
 596         }
 597 }
 598
 599 func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
 600         for i, test := range tests {
 601                 t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
 602                         id := pc(test.left + test.right)
 603                         if *testn >= 0 && i != *testn {
 604                                 return
 605                         }
 606                         t.Run("fn", func(t *testing.T) {
 607                                 out := []byte(test.left)
 608                                 have := string(fn(f, out, test.right))
 609                                 if len(have) != len(test.out) {
 610                                         t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
 611                                 }
 612                                 if have != test.out {
 613                                         k, pf := pidx(have, test.out)
 614                                         t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
 615                                 }
 616                         })
 617
 618                         // Bootstrap by normalizing input. Ensures that the various variants
 619                         // behave the same.
 620                         for g := NFC; g <= NFKD; g++ {
 621                                 if f == g {
 622                                         continue
 623                                 }
 624                                 t.Run(fstr[g], func(t *testing.T) {
 625                                         want := g.String(test.left + test.right)
 626                                         have := string(fn(g, g.AppendString(nil, test.left), test.right))
 627                                         if len(have) != len(want) {
 628                                                 t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
 629                                         }
 630                                         if have != want {
 631                                                 k, pf := pidx(have, want)
 632                                                 t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
 633                                         }
 634                                 })
 635                         }
 636                 })
 637         }
 638 }
 639
// normTests holds one append table per form. runNormTests indexes it with the
// Form value, so the entries must stay in the order of the forms it iterates
// over (NFC through NFKD).
var normTests = [][]AppendTest{
	appendTestsNFC,
	appendTestsNFD,
	appendTestsNFKC,
	appendTestsNFKD,
}
 646
// appendTestsNFC is the append table registered for NFC in normTests. Each
// entry lists left, right, and the expected output.
var appendTestsNFC = []AppendTest{
	{"", ascii, ascii},
	{"", txt_all, txt_all},
	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},

	// Tests designed for Iter.
	{ // ordering of non-composing combining characters
		"",
		"\u0305\u0316",
		"\u0316\u0305",
	},
	{ // segment overflow
		"",
		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
	},

	{ // Combine across non-blocking non-starters.
		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
		"", "a\u0327\u0325", "\u1e01\u0327",
	},

	{ // Jamo V+T does not combine.
		"",
		"\u1161\u11a8",
		"\u1161\u11a8",
	},

	// Stability tests: see http://www.unicode.org/review/pr-29.html.
	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
	{"", "\u0b47\u0b3e", "\u0b4b"},
	{"", "\u1100\u1161", "\uac00"},

	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	{ // 0d4a starts a new segment.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
	},

	{ // Split combining characters.
		// TODO: don't insert CGJ before starters.
		"",
		"\u0d46" + strings.Repeat("\u0d3e", 31),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // Split combining characters.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 30),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // https://golang.org/issues/20079
		"",
		"\xeb\u0344",
		"\xeb\u0308\u0301",
	},

	{ // https://golang.org/issues/20079
		"",
		"\uac00" + strings.Repeat("\u0300", 30),
		"\uac00" + strings.Repeat("\u0300", 29) + "\u034f\u0300",
	},

	{ // https://golang.org/issues/20079
		"",
		"\xeb" + strings.Repeat("\u0300", 31),
		"\xeb" + strings.Repeat("\u0300", 30) + "\u034f\u0300",
	},
}
 721
// appendTestsNFD is the append table registered for NFD in normTests. It is
// currently empty.
var appendTestsNFD = []AppendTest{
	// TODO: Move some of the tests here.
}
 725
// appendTestsNFKC is the append test table for the NFKC form (judging by its
// name; the runner that consumes it is outside this chunk).
//
// NOTE(review): each entry reads as {bytes already in the output buffer,
// input being appended, expected combined result} — confirm against the
// AppendTest definition. cgj, grave and rep are helpers defined elsewhere in
// this file. Entries can presumably be selected by position through the
// -testn flag, so avoid reordering or deleting cases casually.
var appendTestsNFKC = []AppendTest{
	// empty buffers
	{"", "", ""},
	{"a", "", "a"},
	{"", "a", "a"},
	{"", "\u0041\u0307\u0304", "\u01E0"},
	// segment split across buffers
	{"", "a\u0300b", "\u00E0b"},
	{"a", "\u0300b", "\u00E0b"},
	{"a", "\u0300\u0316", "\u00E0\u0316"},
	{"a", "\u0316\u0300", "\u00E0\u0316"},
	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
	{"a\u0300", "\u0327", "\u00E0\u0327"},
	{"a\u0327", "\u0300", "\u00E0\u0327"},
	{"a\u0316", "\u0300", "\u00E0\u0316"},
	{"\u0041\u0307", "\u0304", "\u01E0"},
	// Hangul
	{"", "\u110B\u1173", "\uC73C"},
	{"", "\u1103\u1161", "\uB2E4"},
	{"", "\u110B\u1173\u11B7", "\uC74C"},
	{"", "\u320E", "\x28\uAC00\x29"},
	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
	{"\u1103", "\u1161", "\uB2E4"},
	{"\u110B", "\u1173\u11B7", "\uC74C"},
	{"\u110B\u1173", "\u11B7", "\uC74C"},
	{"\uC73C", "\u11B7", "\uC74C"},
	// UTF-8 encoding split across buffers
	{"a\xCC", "\x80", "\u00E0"},
	{"a\xCC", "\x80b", "\u00E0b"},
	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
	{"a\xCC", "\x80\x80", "\u00E0\x80"},
	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
	// ending in incomplete UTF-8 encoding
	{"", "\xCC", "\xCC"},
	{"a", "\xCC", "a\xCC"},
	{"a", "b\xCC", "ab\xCC"},
	{"\u0226", "\xCC", "\u0226\xCC"},
	// illegal runes
	{"", "\x80", "\x80"},
	{"", "\x80\x80\x80", "\x80\x80\x80"},
	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
	{"", "a\x80", "a\x80"},
	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
	// overflow
	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
	// overflow of combining characters
	{"", grave(34), grave(30) + cgj + grave(4)},
	{"", grave(36), grave(30) + cgj + grave(6)},
	{grave(29), grave(5), grave(30) + cgj + grave(4)},
	{grave(30), grave(4), grave(30) + cgj + grave(4)},
	{grave(30), grave(3), grave(30) + cgj + grave(3)},
	{grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)},
	{"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)},
	{"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)},
	// - First rune has a trailing non-starter.
	{"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)},
	// - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be
	//   inserted even when FF9E starts a new segment.
	{"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\u3099"},
	// - Many non-starter decompositions in a row causing overflow.
	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},

	// - The same overflow with the base written decomposed (U+0644 U+0625)
	//   and as the single rune U+FEF9; both expect the same output.
	{"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
	// NOTE(review): the next entry is identical to the one directly above it.
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},

	// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},

	// weird UTF-8
	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},

	// repeated segments that each need reordering and composition
	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
	// large input.
	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
	// NOTE(review): same case as the last entry under "empty buffers" above.
	{"", "\u0041\u0307\u0304", "\u01E0"},
}
 823
// appendTestsNFKD is the append test table for the NFKD form (judging by its
// name; the runner that consumes it is outside this chunk). Most cases deal
// with long runs of combining marks and where a cgj is expected in the output.
//
// NOTE(review): each entry reads as {bytes already in the output buffer,
// input being appended, expected combined result} — confirm against the
// AppendTest definition. cgj, grave, rep and segSize are defined elsewhere
// in the package.
var appendTestsNFKD = []AppendTest{
	// 64 graves after a starter: expected output has a cgj after each run of 30.
	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},

	{ // segment overflow on unchanged character
		"",
		"a" + grave(64) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
	},
	{ // segment overflow on unchanged character + start value
		"",
		"a" + grave(98) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
	},
	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
		"",
		"a" + grave(59) + "\u0340",
		"a" + grave(30) + cgj + grave(30),
	},
	{ // segment overflow on non-starter decomposition
		"",
		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after ASCII overflow
		"",
		rep('a', segSize) + grave(32) + "\u0320",
		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
	},
	{ // Jamo overflow
		"",
		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul
		"",
		"\uac00",
		"\u1100\u1161",
	},
	{ // Hangul overflow
		"",
		"\uac00" + grave(32) + "\u0320",
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		"\uac00\uac00" + grave(32) + "\u0320",
		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // start value after cc=0
		"",
		"您您" + grave(34) + "\u0320",
		"您您" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after normalization
		"",
		"\u0300\u0320a" + grave(34) + "\u0320",
		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{
		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
		"",
		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
	},
}
 894
 895 func TestAppend(t *testing.T) {
 896         runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
 897                 return f.Append(out, []byte(s)...)
 898         })
 899 }
 900
 901 func TestAppendString(t *testing.T) {
 902         runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
 903                 return f.AppendString(out, s)
 904         })
 905 }
 906
 907 func TestBytes(t *testing.T) {
 908         runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
 909                 buf := []byte{}
 910                 buf = append(buf, out...)
 911                 buf = append(buf, s...)
 912                 return f.Bytes(buf)
 913         })
 914 }
 915
 916 func TestString(t *testing.T) {
 917         runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
 918                 outs := string(out) + s
 919                 return []byte(f.String(outs))
 920         })
 921 }
 922
 923 func TestLinking(t *testing.T) {
 924         const prog = `
 925         package main
 926         import "fmt"
 927         import "golang.org/x/text/unicode/norm"
 928         func main() { fmt.Println(norm.%s) }
 929         `
 930         baseline, errB := testtext.CodeSize(fmt.Sprintf(prog, "MaxSegmentSize"))
 931         withTables, errT := testtext.CodeSize(fmt.Sprintf(prog, `NFC.String("")`))
 932         if errB != nil || errT != nil {
 933                 t.Skipf("code size failed: %v and %v", errB, errT)
 934         }
 935         // Tables are at least 50K
 936         if d := withTables - baseline; d < 50*1024 {
 937                 t.Errorf("tables appear not to be dropped: %d - %d = %d",
 938                         withTables, baseline, d)
 939         }
 940 }
 941
 942 func appendBench(f Form, in []byte) func() {
 943         buf := make([]byte, 0, 4*len(in))
 944         return func() {
 945                 f.Append(buf, in...)
 946         }
 947 }
 948
 949 func bytesBench(f Form, in []byte) func() {
 950         return func() {
 951                 f.Bytes(in)
 952         }
 953 }
 954
 955 func iterBench(f Form, in []byte) func() {
 956         iter := Iter{}
 957         return func() {
 958                 iter.Init(f, in)
 959                 for !iter.Done() {
 960                         iter.Next()
 961                 }
 962         }
 963 }
 964
 965 func transformBench(f Form, in []byte) func() {
 966         buf := make([]byte, 4*len(in))
 967         return func() {
 968                 if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n {
 969                         log.Panic(n, len(in), err)
 970                 }
 971         }
 972 }
 973
 974 func readerBench(f Form, in []byte) func() {
 975         buf := make([]byte, 4*len(in))
 976         return func() {
 977                 r := f.Reader(bytes.NewReader(in))
 978                 var err error
 979                 for err == nil {
 980                         _, err = r.Read(buf)
 981                 }
 982                 if err != io.EOF {
 983                         panic("")
 984                 }
 985         }
 986 }
 987
 988 func writerBench(f Form, in []byte) func() {
 989         buf := make([]byte, 0, 4*len(in))
 990         return func() {
 991                 r := f.Writer(bytes.NewBuffer(buf))
 992                 if _, err := r.Write(in); err != nil {
 993                         panic("")
 994                 }
 995         }
 996 }
 997
 998 func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
 999         bm = append(bm, appendBench(f, in))
1000         bm = append(bm, iterBench(f, in))
1001         bm = append(bm, transformBench(f, in))
1002         bm = append(bm, readerBench(f, in))
1003         bm = append(bm, writerBench(f, in))
1004         return bm
1005 }
1006
1007 func doFormBenchmark(b *testing.B, inf, f Form, s string) {
1008         b.StopTimer()
1009         in := inf.Bytes([]byte(s))
1010         bm := appendBenchmarks(nil, f, in)
1011         b.SetBytes(int64(len(in) * len(bm)))
1012         b.StartTimer()
1013         for i := 0; i < b.N; i++ {
1014                 for _, fn := range bm {
1015                         fn()
1016                 }
1017         }
1018 }
1019
1020 func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) {
1021         b.StopTimer()
1022         fn := f(NFC, s)
1023         b.SetBytes(int64(len(s)))
1024         b.StartTimer()
1025         for i := 0; i < b.N; i++ {
1026                 fn()
1027         }
1028 }
1029
// Small fixed inputs for the single-function benchmarks.
var (
	// smallNoChange spells "nörmalization" with the ö written directly in
	// the source literal.
	smallNoChange = []byte("nörmalization")
	// smallChange spells the same word with "o" followed by U+0308
	// (combining diaeresis), and with a capital N.
	smallChange = []byte("No\u0308rmalization")
	// ascii is a 33-byte ASCII sentence repeated 500 times.
	ascii = strings.Repeat("There is nothing to change here! ", 500)
)
1035
1036 func lowerBench(f Form, in []byte) func() {
1037         // Use package strings instead of bytes as it doesn't allocate memory
1038         // if there aren't any changes.
1039         s := string(in)
1040         return func() {
1041                 strings.ToLower(s)
1042         }
1043 }
1044
// BenchmarkLowerCaseNoChange times strings.ToLower (via lowerBench) on
// smallNoChange.
func BenchmarkLowerCaseNoChange(b *testing.B) {
	doSingle(b, lowerBench, smallNoChange)
}
// BenchmarkLowerCaseChange times strings.ToLower (via lowerBench) on
// smallChange, which starts with an upper-case letter.
func BenchmarkLowerCaseChange(b *testing.B) {
	doSingle(b, lowerBench, smallChange)
}
1051
1052 func quickSpanBench(f Form, in []byte) func() {
1053         return func() {
1054                 f.QuickSpan(in)
1055         }
1056 }
1057
1058 func BenchmarkQuickSpanChangeNFC(b *testing.B) {
1059         doSingle(b, quickSpanBench, smallNoChange)
1060 }
1061
// BenchmarkBytesNoChangeNFC times NFC.Bytes (via bytesBench) on smallNoChange.
func BenchmarkBytesNoChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallNoChange)
}
// BenchmarkBytesChangeNFC times NFC.Bytes (via bytesBench) on smallChange.
func BenchmarkBytesChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallChange)
}
1068
// BenchmarkAppendNoChangeNFC times NFC.Append (via appendBench) on
// smallNoChange.
func BenchmarkAppendNoChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallNoChange)
}
// BenchmarkAppendChangeNFC times NFC.Append (via appendBench) on smallChange.
func BenchmarkAppendChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallChange)
}
// BenchmarkAppendLargeNFC times NFC.Append (via appendBench) on
// txt_all_bytes, the concatenation of all sample texts.
func BenchmarkAppendLargeNFC(b *testing.B) {
	doSingle(b, appendBench, txt_all_bytes)
}
1078
// BenchmarkIterNoChangeNFC times a full Iter pass in NFC (via iterBench)
// over smallNoChange.
func BenchmarkIterNoChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallNoChange)
}
// BenchmarkIterChangeNFC times a full Iter pass in NFC (via iterBench) over
// smallChange.
func BenchmarkIterChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallChange)
}
// BenchmarkIterLargeNFC times a full Iter pass in NFC (via iterBench) over
// txt_all_bytes, the concatenation of all sample texts.
func BenchmarkIterLargeNFC(b *testing.B) {
	doSingle(b, iterBench, txt_all_bytes)
}
1088
// BenchmarkTransformNoChangeNFC times NFC.Transform (via transformBench) on
// smallNoChange.
func BenchmarkTransformNoChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallNoChange)
}
// BenchmarkTransformChangeNFC times NFC.Transform (via transformBench) on
// smallChange.
func BenchmarkTransformChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallChange)
}
// BenchmarkTransformLargeNFC times NFC.Transform (via transformBench) on
// txt_all_bytes, the concatenation of all sample texts.
func BenchmarkTransformLargeNFC(b *testing.B) {
	doSingle(b, transformBench, txt_all_bytes)
}
1098
// BenchmarkNormalizeAsciiNFC runs the doFormBenchmark suite with target form
// NFC on the ascii text (passed through NFC first).
func BenchmarkNormalizeAsciiNFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, ascii)
}
// BenchmarkNormalizeAsciiNFD runs the doFormBenchmark suite with target form
// NFD on the ascii text (passed through NFC first).
func BenchmarkNormalizeAsciiNFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, ascii)
}
// BenchmarkNormalizeAsciiNFKC runs the doFormBenchmark suite with target
// form NFKC on the ascii text (passed through NFC first).
func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
	doFormBenchmark(b, NFC, NFKC, ascii)
}
// BenchmarkNormalizeAsciiNFKD runs the doFormBenchmark suite with target
// form NFKD on the ascii text (passed through NFC first).
func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
	doFormBenchmark(b, NFC, NFKD, ascii)
}
1111
// BenchmarkNormalizeNFC2NFC converts txt_all to NFC and then runs the
// doFormBenchmark suite on it with target form NFC.
func BenchmarkNormalizeNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_all)
}
// BenchmarkNormalizeNFC2NFD converts txt_all to NFC and then runs the
// doFormBenchmark suite on it with target form NFD.
func BenchmarkNormalizeNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_all)
}
// BenchmarkNormalizeNFD2NFC converts txt_all to NFD and then runs the
// doFormBenchmark suite on it with target form NFC.
func BenchmarkNormalizeNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_all)
}
// BenchmarkNormalizeNFD2NFD converts txt_all to NFD and then runs the
// doFormBenchmark suite on it with target form NFD.
func BenchmarkNormalizeNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_all)
}
1124
// Hangul is often special-cased, so we test it separately.

// BenchmarkNormalizeHangulNFC2NFC converts the Korean sample txt_kr to NFC
// and then runs the doFormBenchmark suite on it with target form NFC.
func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_kr)
}
// BenchmarkNormalizeHangulNFC2NFD converts the Korean sample txt_kr to NFC
// and then runs the doFormBenchmark suite on it with target form NFD.
func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_kr)
}
// BenchmarkNormalizeHangulNFD2NFC converts the Korean sample txt_kr to NFD
// and then runs the doFormBenchmark suite on it with target form NFC.
func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_kr)
}
// BenchmarkNormalizeHangulNFD2NFD converts the Korean sample txt_kr to NFD
// and then runs the doFormBenchmark suite on it with target form NFD.
func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_kr)
}
1138
// forms lists the four normalization forms; doTextBenchmark builds its
// benchmark closures once per entry.
var forms = []Form{NFC, NFD, NFKC, NFKD}
1140
1141 func doTextBenchmark(b *testing.B, s string) {
1142         b.StopTimer()
1143         in := []byte(s)
1144         bm := []func(){}
1145         for _, f := range forms {
1146                 bm = appendBenchmarks(bm, f, in)
1147         }
1148         b.SetBytes(int64(len(s) * len(bm)))
1149         b.StartTimer()
1150         for i := 0; i < b.N; i++ {
1151                 for _, f := range bm {
1152                         f()
1153                 }
1154         }
1155 }
1156
// BenchmarkCanonicalOrdering runs the doTextBenchmark suite on txt_canon.
func BenchmarkCanonicalOrdering(b *testing.B) {
	doTextBenchmark(b, txt_canon)
}
// BenchmarkExtendedLatin runs the doTextBenchmark suite on the Vietnamese
// sample txt_vn.
func BenchmarkExtendedLatin(b *testing.B) {
	doTextBenchmark(b, txt_vn)
}
// BenchmarkMiscTwoByteUtf8 runs the doTextBenchmark suite on twoByteUtf8
// (txt_ru, txt_gr, txt_ar and txt_il concatenated).
func BenchmarkMiscTwoByteUtf8(b *testing.B) {
	doTextBenchmark(b, twoByteUtf8)
}
// BenchmarkMiscThreeByteUtf8 runs the doTextBenchmark suite on threeByteUtf8
// (the Thai sample txt_th).
func BenchmarkMiscThreeByteUtf8(b *testing.B) {
	doTextBenchmark(b, threeByteUtf8)
}
// BenchmarkHangul runs the doTextBenchmark suite on the Korean sample txt_kr.
func BenchmarkHangul(b *testing.B) {
	doTextBenchmark(b, txt_kr)
}
// BenchmarkJapanese runs the doTextBenchmark suite on the Japanese sample
// txt_jp.
func BenchmarkJapanese(b *testing.B) {
	doTextBenchmark(b, txt_jp)
}
// BenchmarkChinese runs the doTextBenchmark suite on the Chinese sample
// txt_cn.
func BenchmarkChinese(b *testing.B) {
	doTextBenchmark(b, txt_cn)
}
// BenchmarkOverflow runs the doTextBenchmark suite on overflow, a string
// made of 4096 copies of U+035D followed by one U+035B.
func BenchmarkOverflow(b *testing.B) {
	doTextBenchmark(b, overflow)
}
1181
// overflow is the input for BenchmarkOverflow: U+035D repeated 4096 times,
// followed by a single U+035B.
var overflow = string(append(bytes.Repeat([]byte("\u035D"), 4096), "\u035B"...))
1183
// txt_canon holds tests sampled from the Canonical ordering tests (Part 2) of
// http://unicode.org/Public/UNIDATA/NormalizationTest.txt
//
// The value is built from interpreted (double-quoted) string literals so that
// every \uXXXX escape becomes the code point it names. Inside a raw
// (backquoted) literal the same text would be plain ASCII backslash
// sequences, and the benchmark using it would never see a combining mark.
// The layout — one line per pair of sequences, separated by "\n" — is kept.
const txt_canon = "\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062\n" +
	"\u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062\n" +
	"\u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062\n" +
	"\u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062\n" +
	"\u0061\u059A\u0316\u302A\u0339       \u0061\u0341\u0315\u0300\u05AE\u0062\n" +
	"\u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062\n" +
	"\u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062\n" +
	"\u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062\n" +
	"\u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062\n" +
	"\u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062\n" +
	"\u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062\n" +
	"\u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062\n" +
	"\u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062\n" +
	"\u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062\n" +
	"\u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062\n" +
	"\u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062\n" +
	"\u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062\n" +
	"\u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062"
1204
// txt_vn is a Vietnamese sample text.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả.
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc
một giấy phép khác có các điều khoản tương tự như giấy phép này
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
chúng theo quy định của pháp luật thì tình trạng của nó không
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`
1216
// txt_ru is a Russian sample text.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
//
// NOTE(review): the final line of the literal is Greek, not Russian; it
// reads like a heading that belongs with txt_gr. It is left in place because
// moving it would change the value of txt_ru.
const txt_ru = `При обязательном соблюдении следующих условий:
Attribution — Вы должны атрибутировать произведение (указывать
автора и источник) в порядке, предусмотренном автором или
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
что они поддерживают вас или использование вами данного произведения).
Υπό τις ακόλουθες προϋποθέσεις:`
1224
// txt_gr is a Greek sample text.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
παρόμοια άδεια.`
1233
// txt_ar is an Arabic sample text.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
لهذا الترخيص.`
1241
// txt_il is a Hebrew sample text.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`
1248
// twoByteUtf8 concatenates the Russian, Greek, Arabic and Hebrew samples,
// with no separator between them.
const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il
1250
// txt_kr is a Korean sample text.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다).
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
라이선스와 동일한 라이선스를 적용해야 합니다.`
1258
// txt_th is a Thai sample text.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`
1266
// threeByteUtf8 is the input for BenchmarkMiscThreeByteUtf8; it is currently
// just the Thai sample.
const threeByteUtf8 = txt_th
1268
// txt_jp is a Japanese sample text.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
const txt_jp = `あなたの従うべき条件は以下の通りです。
表示 — あなたは原著作者のクレジットを表示しなければなりません。
継承 — もしあなたがこの作品を改変、変形または加工した場合、
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
頒布することができます。`
1275
// txt_cn is a Chinese sample text.
// Taken from http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由： 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件：
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作，
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`
1283
// txt_cjk concatenates the Chinese, Japanese and Korean samples, with no
// separator between them.
const txt_cjk = txt_cn + txt_jp + txt_kr
// txt_all concatenates every natural-language sample text: Vietnamese, the
// two-byte group, the three-byte group and the CJK group.
const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk
1286
// txt_all_bytes is txt_all as a byte slice, used as the input of the
// "Large" single-function benchmarks.
var txt_all_bytes = []byte(txt_all)