vendor/golang.org/x/text/collate/build/colelem.go

   1 // Copyright 2012 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package build
   6
   7 import (
   8         "fmt"
   9         "unicode"
  10
  11         "golang.org/x/text/internal/colltab"
  12 )
  13
  14 const (
  15         defaultSecondary = 0x20
  16         defaultTertiary  = 0x2
  17         maxTertiary      = 0x1F
  18 )
  19
  20 type rawCE struct {
  21         w   []int
  22         ccc uint8
  23 }
  24
  25 func makeRawCE(w []int, ccc uint8) rawCE {
  26         ce := rawCE{w: make([]int, 4), ccc: ccc}
  27         copy(ce.w, w)
  28         return ce
  29 }
  30
  31 // A collation element is represented as an uint32.
  32 // In the typical case, a rune maps to a single collation element. If a rune
  33 // can be the start of a contraction or expands into multiple collation elements,
  34 // then the collation element that is associated with a rune will have a special
  35 // form to represent such m to n mappings.  Such special collation elements
  36 // have a value >= 0x80000000.
  37
  38 const (
  39         maxPrimaryBits   = 21
  40         maxSecondaryBits = 12
  41         maxTertiaryBits  = 8
  42 )
  43
  44 func makeCE(ce rawCE) (uint32, error) {
  45         v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
  46         return uint32(v), e
  47 }
  48
  49 // For contractions, collation elements are of the form
  50 // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
  51 //   - n* is the size of the first node in the contraction trie.
  52 //   - i* is the index of the first node in the contraction trie.
  53 //   - b* is the offset into the contraction collation element table.
  54 // See contract.go for details on the contraction trie.
  55 const (
  56         contractID            = 0xC0000000
  57         maxNBits              = 4
  58         maxTrieIndexBits      = 12
  59         maxContractOffsetBits = 13
  60 )
  61
  62 func makeContractIndex(h ctHandle, offset int) (uint32, error) {
  63         if h.n >= 1<<maxNBits {
  64                 return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
  65         }
  66         if h.index >= 1<<maxTrieIndexBits {
  67                 return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
  68         }
  69         if offset >= 1<<maxContractOffsetBits {
  70                 return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
  71         }
  72         ce := uint32(contractID)
  73         ce += uint32(offset << (maxNBits + maxTrieIndexBits))
  74         ce += uint32(h.index << maxNBits)
  75         ce += uint32(h.n)
  76         return ce, nil
  77 }
  78
  79 // For expansions, collation elements are of the form
  80 // 11100000 00000000 bbbbbbbb bbbbbbbb,
  81 // where b* is the index into the expansion sequence table.
  82 const (
  83         expandID           = 0xE0000000
  84         maxExpandIndexBits = 16
  85 )
  86
  87 func makeExpandIndex(index int) (uint32, error) {
  88         if index >= 1<<maxExpandIndexBits {
  89                 return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
  90         }
  91         return expandID + uint32(index), nil
  92 }
  93
  94 // Each list of collation elements corresponding to an expansion starts with
  95 // a header indicating the length of the sequence.
  96 func makeExpansionHeader(n int) (uint32, error) {
  97         return uint32(n), nil
  98 }
  99
 100 // Some runes can be expanded using NFKD decomposition. Instead of storing the full
 101 // sequence of collation elements, we decompose the rune and lookup the collation
 102 // elements for each rune in the decomposition and modify the tertiary weights.
 103 // The collation element, in this case, is of the form
 104 // 11110000 00000000 wwwwwwww vvvvvvvv, where
 105 //   - v* is the replacement tertiary weight for the first rune,
 106 //   - w* is the replacement tertiary weight for the second rune,
 107 // Tertiary weights of subsequent runes should be replaced with maxTertiary.
 108 // See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
 109 const (
 110         decompID = 0xF0000000
 111 )
 112
 113 func makeDecompose(t1, t2 int) (uint32, error) {
 114         if t1 >= 256 || t1 < 0 {
 115                 return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
 116         }
 117         if t2 >= 256 || t2 < 0 {
 118                 return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
 119         }
 120         return uint32(t2<<8+t1) + decompID, nil
 121 }
 122
 123 const (
 124         // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
 125         minUnified       rune = 0x4E00
 126         maxUnified            = 0x9FFF
 127         minCompatibility      = 0xF900
 128         maxCompatibility      = 0xFAFF
 129         minRare               = 0x3400
 130         maxRare               = 0x4DBF
 131 )
 132 const (
 133         commonUnifiedOffset = 0x10000
 134         rareUnifiedOffset   = 0x20000 // largest rune in common is U+FAFF
 135         otherOffset         = 0x50000 // largest rune in rare is U+2FA1D
 136         illegalOffset       = otherOffset + int(unicode.MaxRune)
 137         maxPrimary          = illegalOffset + 1
 138 )
 139
 140 // implicitPrimary returns the primary weight for the a rune
 141 // for which there is no entry for the rune in the collation table.
 142 // We take a different approach from the one specified in
 143 // http://unicode.org/reports/tr10/#Implicit_Weights,
 144 // but preserve the resulting relative ordering of the runes.
 145 func implicitPrimary(r rune) int {
 146         if unicode.Is(unicode.Ideographic, r) {
 147                 if r >= minUnified && r <= maxUnified {
 148                         // The most common case for CJK.
 149                         return int(r) + commonUnifiedOffset
 150                 }
 151                 if r >= minCompatibility && r <= maxCompatibility {
 152                         // This will typically not hit. The DUCET explicitly specifies mappings
 153                         // for all characters that do not decompose.
 154                         return int(r) + commonUnifiedOffset
 155                 }
 156                 return int(r) + rareUnifiedOffset
 157         }
 158         return int(r) + otherOffset
 159 }
 160
 161 // convertLargeWeights converts collation elements with large
 162 // primaries (either double primaries or for illegal runes)
 163 // to our own representation.
 164 // A CJK character C is represented in the DUCET as
 165 //   [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
 166 // We will rewrite these characters to a single CE.
 167 // We assume the CJK values start at 0x8000.
 168 // See http://unicode.org/reports/tr10/#Implicit_Weights
 169 func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
 170         const (
 171                 cjkPrimaryStart   = 0xFB40
 172                 rarePrimaryStart  = 0xFB80
 173                 otherPrimaryStart = 0xFBC0
 174                 illegalPrimary    = 0xFFFE
 175                 highBitsMask      = 0x3F
 176                 lowBitsMask       = 0x7FFF
 177                 lowBitsFlag       = 0x8000
 178                 shiftBits         = 15
 179         )
 180         for i := 0; i < len(elems); i++ {
 181                 ce := elems[i].w
 182                 p := ce[0]
 183                 if p < cjkPrimaryStart {
 184                         continue
 185                 }
 186                 if p > 0xFFFF {
 187                         return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
 188                 }
 189                 if p >= illegalPrimary {
 190                         ce[0] = illegalOffset + p - illegalPrimary
 191                 } else {
 192                         if i+1 >= len(elems) {
 193                                 return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
 194                         }
 195                         if elems[i+1].w[0]&lowBitsFlag == 0 {
 196                                 return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
 197                         }
 198                         np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
 199                         switch {
 200                         case p < rarePrimaryStart:
 201                                 np += commonUnifiedOffset
 202                         case p < otherPrimaryStart:
 203                                 np += rareUnifiedOffset
 204                         default:
 205                                 p += otherOffset
 206                         }
 207                         ce[0] = np
 208                         for j := i + 1; j+1 < len(elems); j++ {
 209                                 elems[j] = elems[j+1]
 210                         }
 211                         elems = elems[:len(elems)-1]
 212                 }
 213         }
 214         return elems, nil
 215 }
 216
 217 // nextWeight computes the first possible collation weights following elems
 218 // for the given level.
 219 func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
 220         if level == colltab.Identity {
 221                 next := make([]rawCE, len(elems))
 222                 copy(next, elems)
 223                 return next
 224         }
 225         next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
 226         next[0].w[level]++
 227         if level < colltab.Secondary {
 228                 next[0].w[colltab.Secondary] = defaultSecondary
 229         }
 230         if level < colltab.Tertiary {
 231                 next[0].w[colltab.Tertiary] = defaultTertiary
 232         }
 233         // Filter entries that cannot influence ordering.
 234         for _, ce := range elems[1:] {
 235                 skip := true
 236                 for i := colltab.Primary; i < level; i++ {
 237                         skip = skip && ce.w[i] == 0
 238                 }
 239                 if !skip {
 240                         next = append(next, ce)
 241                 }
 242         }
 243         return next
 244 }
 245
 246 func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
 247         for ; i < len(elems) && elems[i].w[level] == 0; i++ {
 248         }
 249         if i < len(elems) {
 250                 return i, elems[i].w[level]
 251         }
 252         return i, 0
 253 }
 254
 255 // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
 256 // It also returns the collation level at which the difference is found.
 257 func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
 258         for level := colltab.Primary; level < colltab.Identity; level++ {
 259                 var va, vb int
 260                 for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
 261                         ia, va = nextVal(a, ia, level)
 262                         ib, vb = nextVal(b, ib, level)
 263                         if va != vb {
 264                                 if va < vb {
 265                                         return -1, level
 266                                 } else {
 267                                         return 1, level
 268                                 }
 269                         }
 270                 }
 271         }
 272         return 0, colltab.Identity
 273 }
 274
 275 func equalCE(a, b rawCE) bool {
 276         for i := 0; i < 3; i++ {
 277                 if b.w[i] != a.w[i] {
 278                         return false
 279                 }
 280         }
 281         return true
 282 }
 283
 284 func equalCEArrays(a, b []rawCE) bool {
 285         if len(a) != len(b) {
 286                 return false
 287         }
 288         for i := range a {
 289                 if !equalCE(a[i], b[i]) {
 290                         return false
 291                 }
 292         }
 293         return true
 294 }