vendor/golang.org/x/text/internal/triegen/triegen.go

   1 // Copyright 2014 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package triegen implements a code generator for a trie for associating
   6 // unsigned integer values with UTF-8 encoded runes.
   7 //
   8 // Many of the go.text packages use tries for storing per-rune information.  A
   9 // trie is especially useful if many of the runes have the same value. If this
  10 // is the case, many blocks can be expected to be shared allowing for
  11 // information on many runes to be stored in little space.
  12 //
  13 // As most of the lookups are done directly on []byte slices, the tries use the
  14 // UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to
  15 // runes and contributes a little bit to better performance. It also naturally
  16 // provides a fast path for ASCII.
  17 //
  18 // Space is also an issue. There are many code points defined in Unicode and as
  19 // a result tables can get quite large. So every byte counts. The triegen
  20 // package automatically chooses the smallest integer values to represent the
  21 // tables. Compacters allow further compression of the trie by allowing for
  22 // alternative representations of individual trie blocks.
  23 //
  24 // triegen allows generating multiple tries as a single structure. This is
  25 // useful when, for example, one wants to generate tries for several languages
  26 // that have a lot of values in common. Some existing libraries for
  27 // internationalization store all per-language data as a dynamically loadable
  28 // chunk. The go.text packages are designed with the assumption that the user
  29 // typically wants to compile in support for all supported languages, in line
  30 // with the approach common to Go to create a single standalone binary. The
  31 // multi-root trie approach can give significant storage savings in this
  32 // scenario.
  33 //
  34 // triegen generates both tables and code. The code is optimized to use the
  35 // automatically chosen data types. The following code is generated for a Trie
  36 // or multiple Tries named "foo":
  37 //      - type fooTrie
  38 //              The trie type.
  39 //
  40 //      - func newFooTrie(x int) *fooTrie
  41 //              Trie constructor, where x is the index of the trie passed to Gen.
  42 //
  43 //      - func (t *fooTrie) lookup(s []byte) (v uintX, sz int)
  44 //              The lookup method, where uintX is automatically chosen.
  45 //
  46 //      - func lookupString, lookupUnsafe and lookupStringUnsafe
  47 //              Variants of the above.
  48 //
  49 //      - var fooValues and fooIndex and any tables generated by Compacters.
  50 //              The core trie data.
  51 //
  52 //      - var fooTrieHandles
  53 //              Indexes of starter blocks in case of multiple trie roots.
  54 //
  55 // It is recommended that users test the generated trie by checking the returned
  56 // value for every rune. Such exhaustive tests are possible as the the number of
  57 // runes in Unicode is limited.
  58 package triegen // import "golang.org/x/text/internal/triegen"
  59
  60 // TODO: Arguably, the internally optimized data types would not have to be
  61 // exposed in the generated API. We could also investigate not generating the
  62 // code, but using it through a package. We would have to investigate the impact
  63 // on performance of making such change, though. For packages like unicode/norm,
  64 // small changes like this could tank performance.
  65
  66 import (
  67         "encoding/binary"
  68         "fmt"
  69         "hash/crc64"
  70         "io"
  71         "log"
  72         "unicode/utf8"
  73 )
  74
  75 // builder builds a set of tries for associating values with runes. The set of
  76 // tries can share common index and value blocks.
  77 type builder struct {
  78         Name string
  79
  80         // ValueType is the type of the trie values looked up.
  81         ValueType string
  82
  83         // ValueSize is the byte size of the ValueType.
  84         ValueSize int
  85
  86         // IndexType is the type of trie index values used for all UTF-8 bytes of
  87         // a rune except the last one.
  88         IndexType string
  89
  90         // IndexSize is the byte size of the IndexType.
  91         IndexSize int
  92
  93         // SourceType is used when generating the lookup functions. If the user
  94         // requests StringSupport, all lookup functions will be generated for
  95         // string input as well.
  96         SourceType string
  97
  98         Trie []*Trie
  99
 100         IndexBlocks []*node
 101         ValueBlocks [][]uint64
 102         Compactions []compaction
 103         Checksum    uint64
 104
 105         ASCIIBlock   string
 106         StarterBlock string
 107
 108         indexBlockIdx map[uint64]int
 109         valueBlockIdx map[uint64]nodeIndex
 110         asciiBlockIdx map[uint64]int
 111
 112         // Stats are used to fill out the template.
 113         Stats struct {
 114                 NValueEntries int
 115                 NValueBytes   int
 116                 NIndexEntries int
 117                 NIndexBytes   int
 118                 NHandleBytes  int
 119         }
 120
 121         err error
 122 }
 123
 124 // A nodeIndex encodes the index of a node, which is defined by the compaction
 125 // which stores it and an index within the compaction. For internal nodes, the
 126 // compaction is always 0.
 127 type nodeIndex struct {
 128         compaction int
 129         index      int
 130 }
 131
 132 // compaction keeps track of stats used for the compaction.
 133 type compaction struct {
 134         c         Compacter
 135         blocks    []*node
 136         maxHandle uint32
 137         totalSize int
 138
 139         // Used by template-based generator and thus exported.
 140         Cutoff  uint32
 141         Offset  uint32
 142         Handler string
 143 }
 144
 145 func (b *builder) setError(err error) {
 146         if b.err == nil {
 147                 b.err = err
 148         }
 149 }
 150
 151 // An Option can be passed to Gen.
 152 type Option func(b *builder) error
 153
 154 // Compact configures the trie generator to use the given Compacter.
 155 func Compact(c Compacter) Option {
 156         return func(b *builder) error {
 157                 b.Compactions = append(b.Compactions, compaction{
 158                         c:       c,
 159                         Handler: c.Handler() + "(n, b)"})
 160                 return nil
 161         }
 162 }
 163
 164 // Gen writes Go code for a shared trie lookup structure to w for the given
 165 // Tries. The generated trie type will be called nameTrie. newNameTrie(x) will
 166 // return the *nameTrie for tries[x]. A value can be looked up by using one of
 167 // the various lookup methods defined on nameTrie. It returns the table size of
 168 // the generated trie.
 169 func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) {
 170         // The index contains two dummy blocks, followed by the zero block. The zero
 171         // block is at offset 0x80, so that the offset for the zero block for
 172         // continuation bytes is 0.
 173         b := &builder{
 174                 Name:        name,
 175                 Trie:        tries,
 176                 IndexBlocks: []*node{{}, {}, {}},
 177                 Compactions: []compaction{{
 178                         Handler: name + "Values[n<<6+uint32(b)]",
 179                 }},
 180                 // The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero
 181                 // block.
 182                 indexBlockIdx: map[uint64]int{0: 0},
 183                 valueBlockIdx: map[uint64]nodeIndex{0: {}},
 184                 asciiBlockIdx: map[uint64]int{},
 185         }
 186         b.Compactions[0].c = (*simpleCompacter)(b)
 187
 188         for _, f := range opts {
 189                 if err := f(b); err != nil {
 190                         return 0, err
 191                 }
 192         }
 193         b.build()
 194         if b.err != nil {
 195                 return 0, b.err
 196         }
 197         if err = b.print(w); err != nil {
 198                 return 0, err
 199         }
 200         return b.Size(), nil
 201 }
 202
 203 // A Trie represents a single root node of a trie. A builder may build several
 204 // overlapping tries at once.
 205 type Trie struct {
 206         root *node
 207
 208         hiddenTrie
 209 }
 210
 211 // hiddenTrie contains values we want to be visible to the template generator,
 212 // but hidden from the API documentation.
 213 type hiddenTrie struct {
 214         Name         string
 215         Checksum     uint64
 216         ASCIIIndex   int
 217         StarterIndex int
 218 }
 219
 220 // NewTrie returns a new trie root.
 221 func NewTrie(name string) *Trie {
 222         return &Trie{
 223                 &node{
 224                         children: make([]*node, blockSize),
 225                         values:   make([]uint64, utf8.RuneSelf),
 226                 },
 227                 hiddenTrie{Name: name},
 228         }
 229 }
 230
 231 // Gen is a convenience wrapper around the Gen func passing t as the only trie
 232 // and uses the name passed to NewTrie. It returns the size of the generated
 233 // tables.
 234 func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) {
 235         return Gen(w, t.Name, []*Trie{t}, opts...)
 236 }
 237
 238 // node is a node of the intermediate trie structure.
 239 type node struct {
 240         // children holds this node's children. It is always of length 64.
 241         // A child node may be nil.
 242         children []*node
 243
 244         // values contains the values of this node. If it is non-nil, this node is
 245         // either a root or leaf node:
 246         // For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F].
 247         // For leaf nodes, len(values) ==  64 and it maps the bytes in [0x80, 0xBF].
 248         values []uint64
 249
 250         index nodeIndex
 251 }
 252
 253 // Insert associates value with the given rune. Insert will panic if a non-zero
 254 // value is passed for an invalid rune.
 255 func (t *Trie) Insert(r rune, value uint64) {
 256         if value == 0 {
 257                 return
 258         }
 259         s := string(r)
 260         if []rune(s)[0] != r && value != 0 {
 261                 // Note: The UCD tables will always assign what amounts to a zero value
 262                 // to a surrogate. Allowing a zero value for an illegal rune allows
 263                 // users to iterate over [0..MaxRune] without having to explicitly
 264                 // exclude surrogates, which would be tedious.
 265                 panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r))
 266         }
 267         if len(s) == 1 {
 268                 // It is a root node value (ASCII).
 269                 t.root.values[s[0]] = value
 270                 return
 271         }
 272
 273         n := t.root
 274         for ; len(s) > 1; s = s[1:] {
 275                 if n.children == nil {
 276                         n.children = make([]*node, blockSize)
 277                 }
 278                 p := s[0] % blockSize
 279                 c := n.children[p]
 280                 if c == nil {
 281                         c = &node{}
 282                         n.children[p] = c
 283                 }
 284                 if len(s) > 2 && c.values != nil {
 285                         log.Fatalf("triegen: insert(%U): found internal node with values", r)
 286                 }
 287                 n = c
 288         }
 289         if n.values == nil {
 290                 n.values = make([]uint64, blockSize)
 291         }
 292         if n.children != nil {
 293                 log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r)
 294         }
 295         n.values[s[0]-0x80] = value
 296 }
 297
 298 // Size returns the number of bytes the generated trie will take to store. It
 299 // needs to be exported as it is used in the templates.
 300 func (b *builder) Size() int {
 301         // Index blocks.
 302         sz := len(b.IndexBlocks) * blockSize * b.IndexSize
 303
 304         // Skip the first compaction, which represents the normal value blocks, as
 305         // its totalSize does not account for the ASCII blocks, which are managed
 306         // separately.
 307         sz += len(b.ValueBlocks) * blockSize * b.ValueSize
 308         for _, c := range b.Compactions[1:] {
 309                 sz += c.totalSize
 310         }
 311
 312         // TODO: this computation does not account for the fixed overhead of a using
 313         // a compaction, either code or data. As for data, though, the typical
 314         // overhead of data is in the order of bytes (2 bytes for cases). Further,
 315         // the savings of using a compaction should anyway be substantial for it to
 316         // be worth it.
 317
 318         // For multi-root tries, we also need to account for the handles.
 319         if len(b.Trie) > 1 {
 320                 sz += 2 * b.IndexSize * len(b.Trie)
 321         }
 322         return sz
 323 }
 324
 325 func (b *builder) build() {
 326         // Compute the sizes of the values.
 327         var vmax uint64
 328         for _, t := range b.Trie {
 329                 vmax = maxValue(t.root, vmax)
 330         }
 331         b.ValueType, b.ValueSize = getIntType(vmax)
 332
 333         // Compute all block allocations.
 334         // TODO: first compute the ASCII blocks for all tries and then the other
 335         // nodes. ASCII blocks are more restricted in placement, as they require two
 336         // blocks to be placed consecutively. Processing them first may improve
 337         // sharing (at least one zero block can be expected to be saved.)
 338         for _, t := range b.Trie {
 339                 b.Checksum += b.buildTrie(t)
 340         }
 341
 342         // Compute the offsets for all the Compacters.
 343         offset := uint32(0)
 344         for i := range b.Compactions {
 345                 c := &b.Compactions[i]
 346                 c.Offset = offset
 347                 offset += c.maxHandle + 1
 348                 c.Cutoff = offset
 349         }
 350
 351         // Compute the sizes of indexes.
 352         // TODO: different byte positions could have different sizes. So far we have
 353         // not found a case where this is beneficial.
 354         imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff)
 355         for _, ib := range b.IndexBlocks {
 356                 if x := uint64(ib.index.index); x > imax {
 357                         imax = x
 358                 }
 359         }
 360         b.IndexType, b.IndexSize = getIntType(imax)
 361 }
 362
 363 func maxValue(n *node, max uint64) uint64 {
 364         if n == nil {
 365                 return max
 366         }
 367         for _, c := range n.children {
 368                 max = maxValue(c, max)
 369         }
 370         for _, v := range n.values {
 371                 if max < v {
 372                         max = v
 373                 }
 374         }
 375         return max
 376 }
 377
 378 func getIntType(v uint64) (string, int) {
 379         switch {
 380         case v < 1<<8:
 381                 return "uint8", 1
 382         case v < 1<<16:
 383                 return "uint16", 2
 384         case v < 1<<32:
 385                 return "uint32", 4
 386         }
 387         return "uint64", 8
 388 }
 389
 390 const (
 391         blockSize = 64
 392
 393         // Subtract two blocks to offset 0x80, the first continuation byte.
 394         blockOffset = 2
 395
 396         // Subtract three blocks to offset 0xC0, the first non-ASCII starter.
 397         rootBlockOffset = 3
 398 )
 399
 400 var crcTable = crc64.MakeTable(crc64.ISO)
 401
 402 func (b *builder) buildTrie(t *Trie) uint64 {
 403         n := t.root
 404
 405         // Get the ASCII offset. For the first trie, the ASCII block will be at
 406         // position 0.
 407         hasher := crc64.New(crcTable)
 408         binary.Write(hasher, binary.BigEndian, n.values)
 409         hash := hasher.Sum64()
 410
 411         v, ok := b.asciiBlockIdx[hash]
 412         if !ok {
 413                 v = len(b.ValueBlocks)
 414                 b.asciiBlockIdx[hash] = v
 415
 416                 b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:])
 417                 if v == 0 {
 418                         // Add the zero block at position 2 so that it will be assigned a
 419                         // zero reference in the lookup blocks.
 420                         // TODO: always do this? This would allow us to remove a check from
 421                         // the trie lookup, but at the expense of extra space. Analyze
 422                         // performance for unicode/norm.
 423                         b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize))
 424                 }
 425         }
 426         t.ASCIIIndex = v
 427
 428         // Compute remaining offsets.
 429         t.Checksum = b.computeOffsets(n, true)
 430         // We already subtracted the normal blockOffset from the index. Subtract the
 431         // difference for starter bytes.
 432         t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset)
 433         return t.Checksum
 434 }
 435
 436 func (b *builder) computeOffsets(n *node, root bool) uint64 {
 437         // For the first trie, the root lookup block will be at position 3, which is
 438         // the offset for UTF-8 non-ASCII starter bytes.
 439         first := len(b.IndexBlocks) == rootBlockOffset
 440         if first {
 441                 b.IndexBlocks = append(b.IndexBlocks, n)
 442         }
 443
 444         // We special-case the cases where all values recursively are 0. This allows
 445         // for the use of a zero block to which all such values can be directed.
 446         hash := uint64(0)
 447         if n.children != nil || n.values != nil {
 448                 hasher := crc64.New(crcTable)
 449                 for _, c := range n.children {
 450                         var v uint64
 451                         if c != nil {
 452                                 v = b.computeOffsets(c, false)
 453                         }
 454                         binary.Write(hasher, binary.BigEndian, v)
 455                 }
 456                 binary.Write(hasher, binary.BigEndian, n.values)
 457                 hash = hasher.Sum64()
 458         }
 459
 460         if first {
 461                 b.indexBlockIdx[hash] = rootBlockOffset - blockOffset
 462         }
 463
 464         // Compacters don't apply to internal nodes.
 465         if n.children != nil {
 466                 v, ok := b.indexBlockIdx[hash]
 467                 if !ok {
 468                         v = len(b.IndexBlocks) - blockOffset
 469                         b.IndexBlocks = append(b.IndexBlocks, n)
 470                         b.indexBlockIdx[hash] = v
 471                 }
 472                 n.index = nodeIndex{0, v}
 473         } else {
 474                 h, ok := b.valueBlockIdx[hash]
 475                 if !ok {
 476                         bestI, bestSize := 0, blockSize*b.ValueSize
 477                         for i, c := range b.Compactions[1:] {
 478                                 if sz, ok := c.c.Size(n.values); ok && bestSize > sz {
 479                                         bestI, bestSize = i+1, sz
 480                                 }
 481                         }
 482                         c := &b.Compactions[bestI]
 483                         c.totalSize += bestSize
 484                         v := c.c.Store(n.values)
 485                         if c.maxHandle < v {
 486                                 c.maxHandle = v
 487                         }
 488                         h = nodeIndex{bestI, int(v)}
 489                         b.valueBlockIdx[hash] = h
 490                 }
 491                 n.index = h
 492         }
 493         return hash
 494 }