vendor/golang.org/x/text/internal/colltab/iter.go

   1 // Copyright 2015 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package colltab
   6
   7 // An Iter incrementally converts chunks of the input text to collation
   8 // elements, while ensuring that the collation elements are in normalized order
   9 // (that is, they are in the order as if the input text were normalized first).
  10 type Iter struct {
  11         Weighter Weighter
  12         Elems    []Elem
  13         // N is the number of elements in Elems that will not be reordered on
  14         // subsequent iterations, N <= len(Elems).
  15         N int
  16
  17         bytes []byte
  18         str   string
  19         // Because the Elems buffer may contain collation elements that are needed
  20         // for look-ahead, we need two positions in the text (bytes or str): one for
  21         // the end position in the text for the current iteration and one for the
  22         // start of the next call to appendNext.
  23         pEnd  int // end position in text corresponding to N.
  24         pNext int // pEnd <= pNext.
  25 }
  26
  27 // Reset sets the position in the current input text to p and discards any
  28 // results obtained so far.
  29 func (i *Iter) Reset(p int) {
  30         i.Elems = i.Elems[:0]
  31         i.N = 0
  32         i.pEnd = p
  33         i.pNext = p
  34 }
  35
  36 // Len returns the length of the input text.
  37 func (i *Iter) Len() int {
  38         if i.bytes != nil {
  39                 return len(i.bytes)
  40         }
  41         return len(i.str)
  42 }
  43
  44 // Discard removes the collation elements up to N.
  45 func (i *Iter) Discard() {
  46         // TODO: change this such that only modifiers following starters will have
  47         // to be copied.
  48         i.Elems = i.Elems[:copy(i.Elems, i.Elems[i.N:])]
  49         i.N = 0
  50 }
  51
// End returns the end position in the input text for which Next has returned
// results, i.e. the text offset that corresponds to the first N elements of
// Elems.
func (i *Iter) End() int {
	return i.pEnd
}
  57
  58 // SetInput resets i to input s.
  59 func (i *Iter) SetInput(s []byte) {
  60         i.bytes = s
  61         i.str = ""
  62         i.Reset(0)
  63 }
  64
  65 // SetInputString resets i to input s.
  66 func (i *Iter) SetInputString(s string) {
  67         i.str = s
  68         i.bytes = nil
  69         i.Reset(0)
  70 }
  71
  72 func (i *Iter) done() bool {
  73         return i.pNext >= len(i.str) && i.pNext >= len(i.bytes)
  74 }
  75
  76 func (i *Iter) appendNext() bool {
  77         if i.done() {
  78                 return false
  79         }
  80         var sz int
  81         if i.bytes == nil {
  82                 i.Elems, sz = i.Weighter.AppendNextString(i.Elems, i.str[i.pNext:])
  83         } else {
  84                 i.Elems, sz = i.Weighter.AppendNext(i.Elems, i.bytes[i.pNext:])
  85         }
  86         if sz == 0 {
  87                 sz = 1
  88         }
  89         i.pNext += sz
  90         return true
  91 }
  92
  93 // Next appends Elems to the internal array. On each iteration, it will either
  94 // add starters or modifiers. In the majority of cases, an Elem with a primary
  95 // value > 0 will have a CCC of 0. The CCC values of collation elements are also
  96 // used to detect if the input string was not normalized and to adjust the
  97 // result accordingly.
  98 func (i *Iter) Next() bool {
  99         if i.N == len(i.Elems) && !i.appendNext() {
 100                 return false
 101         }
 102
 103         // Check if the current segment starts with a starter.
 104         prevCCC := i.Elems[len(i.Elems)-1].CCC()
 105         if prevCCC == 0 {
 106                 i.N = len(i.Elems)
 107                 i.pEnd = i.pNext
 108                 return true
 109         } else if i.Elems[i.N].CCC() == 0 {
 110                 // set i.N to only cover part of i.Elems for which prevCCC == 0 and
 111                 // use rest for the next call to next.
 112                 for i.N++; i.N < len(i.Elems) && i.Elems[i.N].CCC() == 0; i.N++ {
 113                 }
 114                 i.pEnd = i.pNext
 115                 return true
 116         }
 117
 118         // The current (partial) segment starts with modifiers. We need to collect
 119         // all successive modifiers to ensure that they are normalized.
 120         for {
 121                 p := len(i.Elems)
 122                 i.pEnd = i.pNext
 123                 if !i.appendNext() {
 124                         break
 125                 }
 126
 127                 if ccc := i.Elems[p].CCC(); ccc == 0 || len(i.Elems)-i.N > maxCombiningCharacters {
 128                         // Leave the starter for the next iteration. This ensures that we
 129                         // do not return sequences of collation elements that cross two
 130                         // segments.
 131                         //
 132                         // TODO: handle large number of combining characters by fully
 133                         // normalizing the input segment before iteration. This ensures
 134                         // results are consistent across the text repo.
 135                         i.N = p
 136                         return true
 137                 } else if ccc < prevCCC {
 138                         i.doNorm(p, ccc) // should be rare, never occurs for NFD and FCC.
 139                 } else {
 140                         prevCCC = ccc
 141                 }
 142         }
 143
 144         done := len(i.Elems) != i.N
 145         i.N = len(i.Elems)
 146         return done
 147 }
 148
 149 // nextNoNorm is the same as next, but does not "normalize" the collation
 150 // elements.
 151 func (i *Iter) nextNoNorm() bool {
 152         // TODO: remove this function. Using this instead of next does not seem
 153         // to improve performance in any significant way. We retain this until
 154         // later for evaluation purposes.
 155         if i.done() {
 156                 return false
 157         }
 158         i.appendNext()
 159         i.N = len(i.Elems)
 160         return true
 161 }
 162
// maxCombiningCharacters limits how many collation elements Next gathers past
// N while collecting a run of modifiers; once the pending count exceeds this
// value Next ends the segment instead of accumulating further.
const maxCombiningCharacters = 30
 164
 165 // doNorm reorders the collation elements in i.Elems.
 166 // It assumes that blocks of collation elements added with appendNext
 167 // either start and end with the same CCC or start with CCC == 0.
 168 // This allows for a single insertion point for the entire block.
 169 // The correctness of this assumption is verified in builder.go.
 170 func (i *Iter) doNorm(p int, ccc uint8) {
 171         n := len(i.Elems)
 172         k := p
 173         for p--; p > i.N && ccc < i.Elems[p-1].CCC(); p-- {
 174         }
 175         i.Elems = append(i.Elems, i.Elems[p:k]...)
 176         copy(i.Elems[p:], i.Elems[k:])
 177         i.Elems = i.Elems[:n]
 178 }