vendor/golang.org/x/text/internal/colltab/table.go

   1 // Copyright 2012 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package colltab
   6
   7 import (
   8         "unicode/utf8"
   9
  10         "golang.org/x/text/unicode/norm"
  11 )
  12
// Table holds all collation data for a given collation ordering.
type Table struct {
	Index Trie // main trie

	// expansion info
	//
	// ExpandElem stores expansions as length-prefixed runs: the entry at an
	// expansion index holds a count n, and the n entries that follow are the
	// raw collation elements appended by appendExpansion.
	ExpandElem []uint32

	// contraction info
	//
	// ContractTries supplies the scanners used by matchContraction and
	// matchContractionString. ContractElem holds the raw element chosen for a
	// match; it is indexed by the scanner result plus the offset decoded from
	// the contraction element. VariableTop is the value returned by Top.
	// NOTE(review): MaxContractLen is not read anywhere in this file;
	// presumably it is the longest contraction length — confirm with callers.
	ContractTries  ContractTrieSet
	ContractElem   []uint32
	MaxContractLen int
	VariableTop    uint32
}
  26
  27 func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {
  28         return t.appendNext(w, source{bytes: b})
  29 }
  30
  31 func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) {
  32         return t.appendNext(w, source{str: s})
  33 }
  34
// Start is not implemented yet; calling it always panics with
// "not implemented". The parameters p and b are currently unused.
func (t *Table) Start(p int, b []byte) int {
	// TODO: implement
	panic("not implemented")
}
  39
// StartString is not implemented yet; calling it always panics with
// "not implemented". The parameters p and s are currently unused.
func (t *Table) StartString(p int, s string) int {
	// TODO: implement
	panic("not implemented")
}
  44
// Domain is not implemented yet; calling it always panics with
// "not implemented".
func (t *Table) Domain() []string {
	// TODO: implement
	panic("not implemented")
}
  49
// Top returns the table's VariableTop value.
func (t *Table) Top() uint32 {
	return t.VariableTop
}
  53
// source is the input consumed by appendNext, held either as a string or as
// a byte slice. Its methods read from str when bytes is nil and from bytes
// otherwise.
type source struct {
	str   string
	bytes []byte
}
  58
  59 func (src *source) lookup(t *Table) (ce Elem, sz int) {
  60         if src.bytes == nil {
  61                 return t.Index.lookupString(src.str)
  62         }
  63         return t.Index.lookup(src.bytes)
  64 }
  65
  66 func (src *source) tail(sz int) {
  67         if src.bytes == nil {
  68                 src.str = src.str[sz:]
  69         } else {
  70                 src.bytes = src.bytes[sz:]
  71         }
  72 }
  73
  74 func (src *source) nfd(buf []byte, end int) []byte {
  75         if src.bytes == nil {
  76                 return norm.NFD.AppendString(buf[:0], src.str[:end])
  77         }
  78         return norm.NFD.Append(buf[:0], src.bytes[:end]...)
  79 }
  80
  81 func (src *source) rune() (r rune, sz int) {
  82         if src.bytes == nil {
  83                 return utf8.DecodeRuneInString(src.str)
  84         }
  85         return utf8.DecodeRune(src.bytes)
  86 }
  87
  88 func (src *source) properties(f norm.Form) norm.Properties {
  89         if src.bytes == nil {
  90                 return f.PropertiesString(src.str)
  91         }
  92         return f.Properties(src.bytes)
  93 }
  94
  95 // appendNext appends the weights corresponding to the next rune or
  96 // contraction in s.  If a contraction is matched to a discontinuous
  97 // sequence of runes, the weights for the interstitial runes are
  98 // appended as well.  It returns a new slice that includes the appended
  99 // weights and the number of bytes consumed from s.
 100 func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) {
 101         ce, sz := src.lookup(t)
 102         tp := ce.ctype()
 103         if tp == ceNormal {
 104                 if ce == 0 {
 105                         r, _ := src.rune()
 106                         const (
 107                                 hangulSize  = 3
 108                                 firstHangul = 0xAC00
 109                                 lastHangul  = 0xD7A3
 110                         )
 111                         if r >= firstHangul && r <= lastHangul {
 112                                 // TODO: performance can be considerably improved here.
 113                                 n = sz
 114                                 var buf [16]byte // Used for decomposing Hangul.
 115                                 for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {
 116                                         ce, sz = t.Index.lookup(b)
 117                                         w = append(w, ce)
 118                                 }
 119                                 return w, n
 120                         }
 121                         ce = makeImplicitCE(implicitPrimary(r))
 122                 }
 123                 w = append(w, ce)
 124         } else if tp == ceExpansionIndex {
 125                 w = t.appendExpansion(w, ce)
 126         } else if tp == ceContractionIndex {
 127                 n := 0
 128                 src.tail(sz)
 129                 if src.bytes == nil {
 130                         w, n = t.matchContractionString(w, ce, src.str)
 131                 } else {
 132                         w, n = t.matchContraction(w, ce, src.bytes)
 133                 }
 134                 sz += n
 135         } else if tp == ceDecompose {
 136                 // Decompose using NFKD and replace tertiary weights.
 137                 t1, t2 := splitDecompose(ce)
 138                 i := len(w)
 139                 nfkd := src.properties(norm.NFKD).Decomposition()
 140                 for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
 141                         w, p = t.appendNext(w, source{bytes: nfkd})
 142                 }
 143                 w[i] = w[i].updateTertiary(t1)
 144                 if i++; i < len(w) {
 145                         w[i] = w[i].updateTertiary(t2)
 146                         for i++; i < len(w); i++ {
 147                                 w[i] = w[i].updateTertiary(maxTertiary)
 148                         }
 149                 }
 150         }
 151         return w, sz
 152 }
 153
 154 func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem {
 155         i := splitExpandIndex(ce)
 156         n := int(t.ExpandElem[i])
 157         i++
 158         for _, ce := range t.ExpandElem[i : i+n] {
 159                 w = append(w, Elem(ce))
 160         }
 161         return w
 162 }
 163
// matchContraction matches suffix against the contraction trie selected by
// ce and appends the weights of the matched contraction to w, followed by
// the weights of any runes that were skipped over while matching. It returns
// the extended slice and the count reported by the scanner's result, which
// appendNext adds to its number of consumed bytes.
func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {
	index, n, offset := splitContractIndex(ce)

	scan := t.ContractTries.scanner(index, n, suffix)
	// buf collects skipped (interstitial) runes; buf[:bufp] is the part that
	// is collated separately after the contraction itself.
	buf := [norm.MaxSegmentSize]byte{}
	bufp := 0
	p := scan.scan(0)

	// Only look for a discontinuous match when the scanner is not done and
	// the next byte starts a non-ASCII rune.
	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
		// By now we should have filtered most cases.
		p0 := p
		bufn := 0
		rune := norm.NFD.Properties(suffix[p:])
		p += rune.Size()
		if rune.LeadCCC() != 0 {
			prevCC := rune.TrailCCC()
			// A gap may only occur in the last normalization segment.
			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
			if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
				scan.s = suffix[:p+end]
			}
			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
				rune = norm.NFD.Properties(suffix[p:])
				// Stop at a rune whose leading combining class is zero or
				// does not exceed the previous trailing class.
				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
					break
				}
				prevCC = rune.TrailCCC()
				// The scanner advanced from p, so suffix[p0:p] was skipped.
				if pp := scan.scan(p); pp != p {
					// Copy the interstitial runes for later processing.
					bufn += copy(buf[bufn:], suffix[p0:p])
					// NOTE(review): pindex presumably marks the end of the
					// last complete match; only then are the copied runes
					// committed — confirm against the scanner.
					if scan.pindex == pp {
						bufp = bufn
					}
					p, p0 = pp, pp
				} else {
					p += rune.Size()
				}
			}
		}
	}
	// Append weights for the matched contraction, which may be an expansion.
	i, n := scan.result()
	ce = Elem(t.ContractElem[i+offset])
	if ce.ctype() == ceNormal {
		w = append(w, ce)
	} else {
		w = t.appendExpansion(w, ce)
	}
	// Append weights for the runes in the segment not part of the contraction.
	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
		w, p = t.appendNext(w, source{bytes: b})
	}
	return w, n
}

 218
// TODO: unify the two implementations. This is best done after first simplifying
// the algorithm taking into account the inclusion of both NFC and NFD forms
// in the table.

// matchContractionString is the string variant of matchContraction. It
// matches suffix against the contraction trie selected by ce and appends the
// weights of the matched contraction to w, followed by the weights of any
// runes that were skipped over while matching. It returns the extended slice
// and the count reported by the scanner's result, which appendNext adds to
// its number of consumed bytes.
func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {
	index, n, offset := splitContractIndex(ce)

	scan := t.ContractTries.scannerString(index, n, suffix)
	// buf collects skipped (interstitial) runes; buf[:bufp] is the part that
	// is collated separately after the contraction itself.
	buf := [norm.MaxSegmentSize]byte{}
	bufp := 0
	p := scan.scan(0)

	// Only look for a discontinuous match when the scanner is not done and
	// the next byte starts a non-ASCII rune.
	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
		// By now we should have filtered most cases.
		p0 := p
		bufn := 0
		rune := norm.NFD.PropertiesString(suffix[p:])
		p += rune.Size()
		if rune.LeadCCC() != 0 {
			prevCC := rune.TrailCCC()
			// A gap may only occur in the last normalization segment.
			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
			if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
				scan.s = suffix[:p+end]
			}
			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
				rune = norm.NFD.PropertiesString(suffix[p:])
				// Stop at a rune whose leading combining class is zero or
				// does not exceed the previous trailing class.
				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
					break
				}
				prevCC = rune.TrailCCC()
				// The scanner advanced from p, so suffix[p0:p] was skipped.
				if pp := scan.scan(p); pp != p {
					// Copy the interstitial runes for later processing.
					bufn += copy(buf[bufn:], suffix[p0:p])
					// NOTE(review): pindex presumably marks the end of the
					// last complete match; only then are the copied runes
					// committed — confirm against the scanner.
					if scan.pindex == pp {
						bufp = bufn
					}
					p, p0 = pp, pp
				} else {
					p += rune.Size()
				}
			}
		}
	}
	// Append weights for the matched contraction, which may be an expansion.
	i, n := scan.result()
	ce = Elem(t.ContractElem[i+offset])
	if ce.ctype() == ceNormal {
		w = append(w, ce)
	} else {
		w = t.appendExpansion(w, ce)
	}
	// Append weights for the runes in the segment not part of the contraction.
	// The skipped runes were copied into a byte buffer, so they are collated
	// through the byte-slice form of source.
	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
		w, p = t.appendNext(w, source{bytes: b})
	}
	return w, n
}