vendor/golang.org/x/text/cases/map.go

   1 // Copyright 2014 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package cases
   6
   7 // This file contains the definitions of case mappings for all supported
   8 // languages. The rules for the language-specific tailorings were taken and
   9 // modified from the CLDR transform definitions in common/transforms.
  10
  11 import (
  12         "strings"
  13         "unicode"
  14         "unicode/utf8"
  15
  16         "golang.org/x/text/internal"
  17         "golang.org/x/text/language"
  18         "golang.org/x/text/transform"
  19         "golang.org/x/text/unicode/norm"
  20 )
  21
  22 // A mapFunc takes a context set to the current rune and writes the mapped
  23 // version to the same context. It may advance the context to the next rune. It
  24 // returns whether a checkpoint is possible: whether the pDst bytes written to
  25 // dst so far won't need changing as we see more source bytes.
  26 type mapFunc func(*context) bool
  27
  28 // A spanFunc takes a context set to the current rune and returns whether this
  29 // rune would be altered when written to the output. It may advance the context
  30 // to the next rune. It returns whether a checkpoint is possible.
  31 type spanFunc func(*context) bool
  32
  33 // maxIgnorable defines the maximum number of ignorables to consider for
  34 // lookahead operations.
  35 const maxIgnorable = 30
  36
  37 // supported lists the language tags for which we have tailorings.
  38 const supported = "und af az el lt nl tr"
  39
  40 func init() {
  41         tags := []language.Tag{}
  42         for _, s := range strings.Split(supported, " ") {
  43                 tags = append(tags, language.MustParse(s))
  44         }
  45         matcher = internal.NewInheritanceMatcher(tags)
  46         Supported = language.NewCoverage(tags)
  47 }
  48
  49 var (
  50         matcher *internal.InheritanceMatcher
  51
  52         Supported language.Coverage
  53
  54         // We keep the following lists separate, instead of having a single per-
  55         // language struct, to give the compiler a chance to remove unused code.
  56
  57         // Some uppercase mappers are stateless, so we can precompute the
  58         // Transformers and save a bit on runtime allocations.
  59         upperFunc = []struct {
  60                 upper mapFunc
  61                 span  spanFunc
  62         }{
  63                 {nil, nil},                  // und
  64                 {nil, nil},                  // af
  65                 {aztrUpper(upper), isUpper}, // az
  66                 {elUpper, noSpan},           // el
  67                 {ltUpper(upper), noSpan},    // lt
  68                 {nil, nil},                  // nl
  69                 {aztrUpper(upper), isUpper}, // tr
  70         }
  71
  72         undUpper            transform.SpanningTransformer = &undUpperCaser{}
  73         undLower            transform.SpanningTransformer = &undLowerCaser{}
  74         undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{}
  75
  76         lowerFunc = []mapFunc{
  77                 nil,       // und
  78                 nil,       // af
  79                 aztrLower, // az
  80                 nil,       // el
  81                 ltLower,   // lt
  82                 nil,       // nl
  83                 aztrLower, // tr
  84         }
  85
  86         titleInfos = []struct {
  87                 title     mapFunc
  88                 lower     mapFunc
  89                 titleSpan spanFunc
  90                 rewrite   func(*context)
  91         }{
  92                 {title, lower, isTitle, nil},                // und
  93                 {title, lower, isTitle, afnlRewrite},        // af
  94                 {aztrUpper(title), aztrLower, isTitle, nil}, // az
  95                 {title, lower, isTitle, nil},                // el
  96                 {ltUpper(title), ltLower, noSpan, nil},      // lt
  97                 {nlTitle, lower, nlTitleSpan, afnlRewrite},  // nl
  98                 {aztrUpper(title), aztrLower, isTitle, nil}, // tr
  99         }
 100 )
 101
 102 func makeUpper(t language.Tag, o options) transform.SpanningTransformer {
 103         _, i, _ := matcher.Match(t)
 104         f := upperFunc[i].upper
 105         if f == nil {
 106                 return undUpper
 107         }
 108         return &simpleCaser{f: f, span: upperFunc[i].span}
 109 }
 110
 111 func makeLower(t language.Tag, o options) transform.SpanningTransformer {
 112         _, i, _ := matcher.Match(t)
 113         f := lowerFunc[i]
 114         if f == nil {
 115                 if o.ignoreFinalSigma {
 116                         return undLowerIgnoreSigma
 117                 }
 118                 return undLower
 119         }
 120         if o.ignoreFinalSigma {
 121                 return &simpleCaser{f: f, span: isLower}
 122         }
 123         return &lowerCaser{
 124                 first:   f,
 125                 midWord: finalSigma(f),
 126         }
 127 }
 128
 129 func makeTitle(t language.Tag, o options) transform.SpanningTransformer {
 130         _, i, _ := matcher.Match(t)
 131         x := &titleInfos[i]
 132         lower := x.lower
 133         if o.noLower {
 134                 lower = (*context).copy
 135         } else if !o.ignoreFinalSigma {
 136                 lower = finalSigma(lower)
 137         }
 138         return &titleCaser{
 139                 title:     x.title,
 140                 lower:     lower,
 141                 titleSpan: x.titleSpan,
 142                 rewrite:   x.rewrite,
 143         }
 144 }
 145
 146 func noSpan(c *context) bool {
 147         c.err = transform.ErrEndOfSpan
 148         return false
 149 }
 150
 151 // TODO: consider a similar special case for the fast majority lower case. This
 152 // is a bit more involved so will require some more precise benchmarking to
 153 // justify it.
 154
 155 type undUpperCaser struct{ transform.NopResetter }
 156
 157 // undUpperCaser implements the Transformer interface for doing an upper case
 158 // mapping for the root locale (und). It eliminates the need for an allocation
 159 // as it prevents escaping by not using function pointers.
 160 func (t undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 161         c := context{dst: dst, src: src, atEOF: atEOF}
 162         for c.next() {
 163                 upper(&c)
 164                 c.checkpoint()
 165         }
 166         return c.ret()
 167 }
 168
 169 func (t undUpperCaser) Span(src []byte, atEOF bool) (n int, err error) {
 170         c := context{src: src, atEOF: atEOF}
 171         for c.next() && isUpper(&c) {
 172                 c.checkpoint()
 173         }
 174         return c.retSpan()
 175 }
 176
 177 // undLowerIgnoreSigmaCaser implements the Transformer interface for doing
 178 // a lower case mapping for the root locale (und) ignoring final sigma
 179 // handling. This casing algorithm is used in some performance-critical packages
 180 // like secure/precis and x/net/http/idna, which warrants its special-casing.
 181 type undLowerIgnoreSigmaCaser struct{ transform.NopResetter }
 182
 183 func (t undLowerIgnoreSigmaCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 184         c := context{dst: dst, src: src, atEOF: atEOF}
 185         for c.next() && lower(&c) {
 186                 c.checkpoint()
 187         }
 188         return c.ret()
 189
 190 }
 191
 192 // Span implements a generic lower-casing. This is possible as isLower works
 193 // for all lowercasing variants. All lowercase variants only vary in how they
 194 // transform a non-lowercase letter. They will never change an already lowercase
 195 // letter. In addition, there is no state.
 196 func (t undLowerIgnoreSigmaCaser) Span(src []byte, atEOF bool) (n int, err error) {
 197         c := context{src: src, atEOF: atEOF}
 198         for c.next() && isLower(&c) {
 199                 c.checkpoint()
 200         }
 201         return c.retSpan()
 202 }
 203
 204 type simpleCaser struct {
 205         context
 206         f    mapFunc
 207         span spanFunc
 208 }
 209
 210 // simpleCaser implements the Transformer interface for doing a case operation
 211 // on a rune-by-rune basis.
 212 func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 213         c := context{dst: dst, src: src, atEOF: atEOF}
 214         for c.next() && t.f(&c) {
 215                 c.checkpoint()
 216         }
 217         return c.ret()
 218 }
 219
 220 func (t *simpleCaser) Span(src []byte, atEOF bool) (n int, err error) {
 221         c := context{src: src, atEOF: atEOF}
 222         for c.next() && t.span(&c) {
 223                 c.checkpoint()
 224         }
 225         return c.retSpan()
 226 }
 227
 228 // undLowerCaser implements the Transformer interface for doing a lower case
 229 // mapping for the root locale (und) ignoring final sigma handling. This casing
 230 // algorithm is used in some performance-critical packages like secure/precis
 231 // and x/net/http/idna, which warrants its special-casing.
 232 type undLowerCaser struct{ transform.NopResetter }
 233
 234 func (t undLowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 235         c := context{dst: dst, src: src, atEOF: atEOF}
 236
 237         for isInterWord := true; c.next(); {
 238                 if isInterWord {
 239                         if c.info.isCased() {
 240                                 if !lower(&c) {
 241                                         break
 242                                 }
 243                                 isInterWord = false
 244                         } else if !c.copy() {
 245                                 break
 246                         }
 247                 } else {
 248                         if c.info.isNotCasedAndNotCaseIgnorable() {
 249                                 if !c.copy() {
 250                                         break
 251                                 }
 252                                 isInterWord = true
 253                         } else if !c.hasPrefix("Σ") {
 254                                 if !lower(&c) {
 255                                         break
 256                                 }
 257                         } else if !finalSigmaBody(&c) {
 258                                 break
 259                         }
 260                 }
 261                 c.checkpoint()
 262         }
 263         return c.ret()
 264 }
 265
 266 func (t undLowerCaser) Span(src []byte, atEOF bool) (n int, err error) {
 267         c := context{src: src, atEOF: atEOF}
 268         for c.next() && isLower(&c) {
 269                 c.checkpoint()
 270         }
 271         return c.retSpan()
 272 }
 273
 274 // lowerCaser implements the Transformer interface. The default Unicode lower
 275 // casing requires different treatment for the first and subsequent characters
 276 // of a word, most notably to handle the Greek final Sigma.
 277 type lowerCaser struct {
 278         undLowerIgnoreSigmaCaser
 279
 280         context
 281
 282         first, midWord mapFunc
 283 }
 284
 285 func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 286         t.context = context{dst: dst, src: src, atEOF: atEOF}
 287         c := &t.context
 288
 289         for isInterWord := true; c.next(); {
 290                 if isInterWord {
 291                         if c.info.isCased() {
 292                                 if !t.first(c) {
 293                                         break
 294                                 }
 295                                 isInterWord = false
 296                         } else if !c.copy() {
 297                                 break
 298                         }
 299                 } else {
 300                         if c.info.isNotCasedAndNotCaseIgnorable() {
 301                                 if !c.copy() {
 302                                         break
 303                                 }
 304                                 isInterWord = true
 305                         } else if !t.midWord(c) {
 306                                 break
 307                         }
 308                 }
 309                 c.checkpoint()
 310         }
 311         return c.ret()
 312 }
 313
 314 // titleCaser implements the Transformer interface. Title casing algorithms
 315 // distinguish between the first letter of a word and subsequent letters of the
 316 // same word. It uses state to avoid requiring a potentially infinite lookahead.
 317 type titleCaser struct {
 318         context
 319
 320         // rune mappings used by the actual casing algorithms.
 321         title     mapFunc
 322         lower     mapFunc
 323         titleSpan spanFunc
 324
 325         rewrite func(*context)
 326 }
 327
 328 // Transform implements the standard Unicode title case algorithm as defined in
 329 // Chapter 3 of The Unicode Standard:
 330 // toTitlecase(X): Find the word boundaries in X according to Unicode Standard
 331 // Annex #29, "Unicode Text Segmentation." For each word boundary, find the
 332 // first cased character F following the word boundary. If F exists, map F to
 333 // Titlecase_Mapping(F); then map all characters C between F and the following
 334 // word boundary to Lowercase_Mapping(C).
 335 func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 336         t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
 337         c := &t.context
 338
 339         if !c.next() {
 340                 return c.ret()
 341         }
 342
 343         for {
 344                 p := c.info
 345                 if t.rewrite != nil {
 346                         t.rewrite(c)
 347                 }
 348
 349                 wasMid := p.isMid()
 350                 // Break out of this loop on failure to ensure we do not modify the
 351                 // state incorrectly.
 352                 if p.isCased() {
 353                         if !c.isMidWord {
 354                                 if !t.title(c) {
 355                                         break
 356                                 }
 357                                 c.isMidWord = true
 358                         } else if !t.lower(c) {
 359                                 break
 360                         }
 361                 } else if !c.copy() {
 362                         break
 363                 } else if p.isBreak() {
 364                         c.isMidWord = false
 365                 }
 366
 367                 // As we save the state of the transformer, it is safe to call
 368                 // checkpoint after any successful write.
 369                 if !(c.isMidWord && wasMid) {
 370                         c.checkpoint()
 371                 }
 372
 373                 if !c.next() {
 374                         break
 375                 }
 376                 if wasMid && c.info.isMid() {
 377                         c.isMidWord = false
 378                 }
 379         }
 380         return c.ret()
 381 }
 382
 383 func (t *titleCaser) Span(src []byte, atEOF bool) (n int, err error) {
 384         t.context = context{src: src, atEOF: atEOF, isMidWord: t.isMidWord}
 385         c := &t.context
 386
 387         if !c.next() {
 388                 return c.retSpan()
 389         }
 390
 391         for {
 392                 p := c.info
 393                 if t.rewrite != nil {
 394                         t.rewrite(c)
 395                 }
 396
 397                 wasMid := p.isMid()
 398                 // Break out of this loop on failure to ensure we do not modify the
 399                 // state incorrectly.
 400                 if p.isCased() {
 401                         if !c.isMidWord {
 402                                 if !t.titleSpan(c) {
 403                                         break
 404                                 }
 405                                 c.isMidWord = true
 406                         } else if !isLower(c) {
 407                                 break
 408                         }
 409                 } else if p.isBreak() {
 410                         c.isMidWord = false
 411                 }
 412                 // As we save the state of the transformer, it is safe to call
 413                 // checkpoint after any successful write.
 414                 if !(c.isMidWord && wasMid) {
 415                         c.checkpoint()
 416                 }
 417
 418                 if !c.next() {
 419                         break
 420                 }
 421                 if wasMid && c.info.isMid() {
 422                         c.isMidWord = false
 423                 }
 424         }
 425         return c.retSpan()
 426 }
 427
 428 // finalSigma adds Greek final Sigma handing to another casing function. It
 429 // determines whether a lowercased sigma should be σ or ς, by looking ahead for
 430 // case-ignorables and a cased letters.
 431 func finalSigma(f mapFunc) mapFunc {
 432         return func(c *context) bool {
 433                 if !c.hasPrefix("Σ") {
 434                         return f(c)
 435                 }
 436                 return finalSigmaBody(c)
 437         }
 438 }
 439
 440 func finalSigmaBody(c *context) bool {
 441         // Current rune must be ∑.
 442
 443         // ::NFD();
 444         // # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
 445         // Σ } [:case-ignorable:]* [:cased:] → σ;
 446         // [:cased:] [:case-ignorable:]* { Σ → ς;
 447         // ::Any-Lower;
 448         // ::NFC();
 449
 450         p := c.pDst
 451         c.writeString("ς")
 452
 453         // TODO: we should do this here, but right now this will never have an
 454         // effect as this is called when the prefix is Sigma, whereas Dutch and
 455         // Afrikaans only test for an apostrophe.
 456         //
 457         // if t.rewrite != nil {
 458         //      t.rewrite(c)
 459         // }
 460
 461         // We need to do one more iteration after maxIgnorable, as a cased
 462         // letter is not an ignorable and may modify the result.
 463         wasMid := false
 464         for i := 0; i < maxIgnorable+1; i++ {
 465                 if !c.next() {
 466                         return false
 467                 }
 468                 if !c.info.isCaseIgnorable() {
 469                         // All Midword runes are also case ignorable, so we are
 470                         // guaranteed to have a letter or word break here. As we are
 471                         // unreading the run, there is no need to unset c.isMidWord;
 472                         // the title caser will handle this.
 473                         if c.info.isCased() {
 474                                 // p+1 is guaranteed to be in bounds: if writing ς was
 475                                 // successful, p+1 will contain the second byte of ς. If not,
 476                                 // this function will have returned after c.next returned false.
 477                                 c.dst[p+1]++ // ς → σ
 478                         }
 479                         c.unreadRune()
 480                         return true
 481                 }
 482                 // A case ignorable may also introduce a word break, so we may need
 483                 // to continue searching even after detecting a break.
 484                 isMid := c.info.isMid()
 485                 if (wasMid && isMid) || c.info.isBreak() {
 486                         c.isMidWord = false
 487                 }
 488                 wasMid = isMid
 489                 c.copy()
 490         }
 491         return true
 492 }
 493
 494 // finalSigmaSpan would be the same as isLower.
 495
 496 // elUpper implements Greek upper casing, which entails removing a predefined
 497 // set of non-blocked modifiers. Note that these accents should not be removed
 498 // for title casing!
 499 // Example: "Οδός" -> "ΟΔΟΣ".
 500 func elUpper(c *context) bool {
 501         // From CLDR:
 502         // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
 503         // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;
 504
 505         r, _ := utf8.DecodeRune(c.src[c.pSrc:])
 506         oldPDst := c.pDst
 507         if !upper(c) {
 508                 return false
 509         }
 510         if !unicode.Is(unicode.Greek, r) {
 511                 return true
 512         }
 513         i := 0
 514         // Take the properties of the uppercased rune that is already written to the
 515         // destination. This saves us the trouble of having to uppercase the
 516         // decomposed rune again.
 517         if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
 518                 // Restore the destination position and process the decomposed rune.
 519                 r, sz := utf8.DecodeRune(b)
 520                 if r <= 0xFF { // See A.6.1
 521                         return true
 522                 }
 523                 c.pDst = oldPDst
 524                 // Insert the first rune and ignore the modifiers. See A.6.2.
 525                 c.writeBytes(b[:sz])
 526                 i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
 527         }
 528
 529         for ; i < maxIgnorable && c.next(); i++ {
 530                 switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
 531                 // Above and Iota Subscript
 532                 case 0x0300, // U+0300 COMBINING GRAVE ACCENT
 533                         0x0301, // U+0301 COMBINING ACUTE ACCENT
 534                         0x0304, // U+0304 COMBINING MACRON
 535                         0x0306, // U+0306 COMBINING BREVE
 536                         0x0308, // U+0308 COMBINING DIAERESIS
 537                         0x0313, // U+0313 COMBINING COMMA ABOVE
 538                         0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
 539                         0x0342, // U+0342 COMBINING GREEK PERISPOMENI
 540                         0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
 541                         // No-op. Gobble the modifier.
 542
 543                 default:
 544                         switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
 545                         case cccZero:
 546                                 c.unreadRune()
 547                                 return true
 548
 549                         // We don't need to test for IotaSubscript as the only rune that
 550                         // qualifies (U+0345) was already excluded in the switch statement
 551                         // above. See A.4.
 552
 553                         case cccAbove:
 554                                 return c.copy()
 555                         default:
 556                                 // Some other modifier. We're still allowed to gobble Greek
 557                                 // modifiers after this.
 558                                 c.copy()
 559                         }
 560                 }
 561         }
 562         return i == maxIgnorable
 563 }
 564
 565 // TODO: implement elUpperSpan (low-priority: complex and infrequent).
 566
 567 func ltLower(c *context) bool {
 568         // From CLDR:
 569         // # Introduce an explicit dot above when lowercasing capital I's and J's
 570         // # whenever there are more accents above.
 571         // # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
 572         // # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
 573         // # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
 574         // # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
 575         // # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
 576         // # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
 577         // # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
 578         // ::NFD();
 579         // I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
 580         // J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
 581         // I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
 582         // I \u0300 (Ì) → i \u0307 \u0300;
 583         // I \u0301 (Í) → i \u0307 \u0301;
 584         // I \u0303 (Ĩ) → i \u0307 \u0303;
 585         // ::Any-Lower();
 586         // ::NFC();
 587
 588         i := 0
 589         if r := c.src[c.pSrc]; r < utf8.RuneSelf {
 590                 lower(c)
 591                 if r != 'I' && r != 'J' {
 592                         return true
 593                 }
 594         } else {
 595                 p := norm.NFD.Properties(c.src[c.pSrc:])
 596                 if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
 597                         // UTF-8 optimization: the decomposition will only have an above
 598                         // modifier if the last rune of the decomposition is in [U+300-U+311].
 599                         // In all other cases, a decomposition starting with I is always
 600                         // an I followed by modifiers that are not cased themselves. See A.2.
 601                         if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
 602                                 if !c.writeBytes(d[:1]) {
 603                                         return false
 604                                 }
 605                                 c.dst[c.pDst-1] += 'a' - 'A' // lower
 606
 607                                 // Assumption: modifier never changes on lowercase. See A.1.
 608                                 // Assumption: all modifiers added have CCC = Above. See A.2.3.
 609                                 return c.writeString("\u0307") && c.writeBytes(d[1:])
 610                         }
 611                         // In all other cases the additional modifiers will have a CCC
 612                         // that is less than 230 (Above). We will insert the U+0307, if
 613                         // needed, after these modifiers so that a string in FCD form
 614                         // will remain so. See A.2.2.
 615                         lower(c)
 616                         i = 1
 617                 } else {
 618                         return lower(c)
 619                 }
 620         }
 621
 622         for ; i < maxIgnorable && c.next(); i++ {
 623                 switch c.info.cccType() {
 624                 case cccZero:
 625                         c.unreadRune()
 626                         return true
 627                 case cccAbove:
 628                         return c.writeString("\u0307") && c.copy() // See A.1.
 629                 default:
 630                         c.copy() // See A.1.
 631                 }
 632         }
 633         return i == maxIgnorable
 634 }
 635
 636 // ltLowerSpan would be the same as isLower.
 637
 638 func ltUpper(f mapFunc) mapFunc {
 639         return func(c *context) bool {
 640                 // Unicode:
 641                 // 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
 642                 //
 643                 // From CLDR:
 644                 // # Remove \u0307 following soft-dotteds (i, j, and the like), with possible
 645                 // # intervening non-230 marks.
 646                 // ::NFD();
 647                 // [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
 648                 // ::Any-Upper();
 649                 // ::NFC();
 650
 651                 // TODO: See A.5. A soft-dotted rune never has an exception. This would
 652                 // allow us to overload the exception bit and encode this property in
 653                 // info. Need to measure performance impact of this.
 654                 r, _ := utf8.DecodeRune(c.src[c.pSrc:])
 655                 oldPDst := c.pDst
 656                 if !f(c) {
 657                         return false
 658                 }
 659                 if !unicode.Is(unicode.Soft_Dotted, r) {
 660                         return true
 661                 }
 662
 663                 // We don't need to do an NFD normalization, as a soft-dotted rune never
 664                 // contains U+0307. See A.3.
 665
 666                 i := 0
 667                 for ; i < maxIgnorable && c.next(); i++ {
 668                         switch c.info.cccType() {
 669                         case cccZero:
 670                                 c.unreadRune()
 671                                 return true
 672                         case cccAbove:
 673                                 if c.hasPrefix("\u0307") {
 674                                         // We don't do a full NFC, but rather combine runes for
 675                                         // some of the common cases. (Returning NFC or
 676                                         // preserving normal form is neither a requirement nor
 677                                         // a possibility anyway).
 678                                         if !c.next() {
 679                                                 return false
 680                                         }
 681                                         if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
 682                                                 s := ""
 683                                                 switch c.src[c.pSrc+1] {
 684                                                 case 0x80: // U+0300 COMBINING GRAVE ACCENT
 685                                                         s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
 686                                                 case 0x81: // U+0301 COMBINING ACUTE ACCENT
 687                                                         s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
 688                                                 case 0x83: // U+0303 COMBINING TILDE
 689                                                         s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
 690                                                 case 0x88: // U+0308 COMBINING DIAERESIS
 691                                                         s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
 692                                                 default:
 693                                                 }
 694                                                 if s != "" {
 695                                                         c.pDst = oldPDst
 696                                                         return c.writeString(s)
 697                                                 }
 698                                         }
 699                                 }
 700                                 return c.copy()
 701                         default:
 702                                 c.copy()
 703                         }
 704                 }
 705                 return i == maxIgnorable
 706         }
 707 }
 708
 709 // TODO: implement ltUpperSpan (low priority: complex and infrequent).
 710
 711 func aztrUpper(f mapFunc) mapFunc {
 712         return func(c *context) bool {
 713                 // i→İ;
 714                 if c.src[c.pSrc] == 'i' {
 715                         return c.writeString("İ")
 716                 }
 717                 return f(c)
 718         }
 719 }
 720
 721 func aztrLower(c *context) (done bool) {
 722         // From CLDR:
 723         // # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
 724         // # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 725         // İ→i;
 726         // # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
 727         // # This matches the behavior of the canonically equivalent I-dot_above
 728         // # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
 729         // # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
 730         // # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
 731         // I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
 732         // I→ı ;
 733         // ::Any-Lower();
 734         if c.hasPrefix("\u0130") { // İ
 735                 return c.writeString("i")
 736         }
 737         if c.src[c.pSrc] != 'I' {
 738                 return lower(c)
 739         }
 740
 741         // We ignore the lower-case I for now, but insert it later when we know
 742         // which form we need.
 743         start := c.pSrc + c.sz
 744
 745         i := 0
 746 Loop:
 747         // We check for up to n ignorables before \u0307. As \u0307 is an
 748         // ignorable as well, n is maxIgnorable-1.
 749         for ; i < maxIgnorable && c.next(); i++ {
 750                 switch c.info.cccType() {
 751                 case cccAbove:
 752                         if c.hasPrefix("\u0307") {
 753                                 return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
 754                         }
 755                         done = true
 756                         break Loop
 757                 case cccZero:
 758                         c.unreadRune()
 759                         done = true
 760                         break Loop
 761                 default:
 762                         // We'll write this rune after we know which starter to use.
 763                 }
 764         }
 765         if i == maxIgnorable {
 766                 done = true
 767         }
 768         return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
 769 }
 770
 771 // aztrLowerSpan would be the same as isLower.
 772
 773 func nlTitle(c *context) bool {
 774         // From CLDR:
 775         // # Special titlecasing for Dutch initial "ij".
 776         // ::Any-Title();
 777         // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
 778         // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
 779         if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
 780                 return title(c)
 781         }
 782
 783         if !c.writeString("I") || !c.next() {
 784                 return false
 785         }
 786         if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
 787                 return c.writeString("J")
 788         }
 789         c.unreadRune()
 790         return true
 791 }
 792
 793 func nlTitleSpan(c *context) bool {
 794         // From CLDR:
 795         // # Special titlecasing for Dutch initial "ij".
 796         // ::Any-Title();
 797         // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
 798         // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
 799         if c.src[c.pSrc] != 'I' {
 800                 return isTitle(c)
 801         }
 802         if !c.next() || c.src[c.pSrc] == 'j' {
 803                 return false
 804         }
 805         if c.src[c.pSrc] != 'J' {
 806                 c.unreadRune()
 807         }
 808         return true
 809 }
 810
 811 // Not part of CLDR, but see http://unicode.org/cldr/trac/ticket/7078.
 812 func afnlRewrite(c *context) {
 813         if c.hasPrefix("'") || c.hasPrefix("’") {
 814                 c.isMidWord = true
 815         }
 816 }