1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
7 // This program generates the trie for idna operations. The Unicode casing
8 // algorithm requires the lookup of various properties and mappings for each
9 // rune. The table generated by this generator combines several of the most
10 // frequently used of these into a single trie so that they can be accessed
11 // with a single lookup.
21 "golang.org/x/text/internal/gen"
22 "golang.org/x/text/internal/triegen"
23 "golang.org/x/text/internal/ucd"
29 gen.Repackage("gen_trieval.go", "trieval.go", "idna")
30 gen.Repackage("gen_common.go", "common_test.go", "idna")
33 var runes = map[rune]info{}
36 t := triegen.NewTrie("idna")
38 ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
42 if p.Int(ucd.CanonicalCombiningClass) == cccVirama {
43 runes[p.Rune(0)] = viramaModifier
46 case unicode.In(r, unicode.Mark):
51 ucd.Parse(gen.OpenUCDFile("extracted/DerivedJoiningType.txt"), func(p *ucd.Parser) {
52 switch v := p.String(1); v {
53 case "L", "D", "T", "R":
54 runes[p.Rune(0)] |= joinType[v] << joinShift
58 ucd.Parse(gen.OpenUnicodeFile("idna", "", "IdnaMappingTable.txt"), func(p *ucd.Parser) {
61 // The mappings table explicitly defines surrogates as invalid.
62 if !utf8.ValidRune(r) {
66 cat := catFromEntry(p)
67 isMapped := cat == mapped || cat == disallowedSTD3Mapped || cat == deviation
69 // Only include additional category information for non-mapped
70 // runes. The additional information is only used after mapping and
71 // the bits would clash with mapping information.
72 // TODO: it would be possible to inline this data and avoid
73 // additional lookups. This is quite tedious, though, so let's first
74 // see if we need this.
75 cat |= category(runes[r])
78 s := string(p.Runes(2))
79 if s != "" && !isMapped {
80 log.Fatalf("%U: Mapping with non-mapping category %d", r, cat)
82 t.Insert(r, uint64(makeEntry(r, s))+uint64(cat))
85 w := gen.NewCodeWriter()
86 defer w.WriteGoFile("tables.go", "idna")
88 gen.WriteUnicodeVersion(w)
90 w.WriteVar("mappings", string(mappings))
91 w.WriteVar("xorData", string(xorData))
93 sz, err := t.Gen(w, triegen.Compact(&normCompacter{}))
101 // mappings contains replacement strings for mapped runes, each prefixed
102 // with a byte containing the length of the following string.
104 mapCache = map[string]int{}
106 // xorData is like mappings, except that it contains XOR data.
107 // We split these two tables so that we don't get an overflow.
109 xorCache = map[string]int{}
112 // makeEntry creates a trie entry.
113 func makeEntry(r rune, mapped string) info {
116 if len(orig) != len(mapped) {
117 // Store the mapped value as is in the mappings table.
118 index := len(mappings)
119 if x, ok := mapCache[mapped]; ok {
122 mapCache[mapped] = index
123 mappings = append(mappings, byte(len(mapped)))
124 mappings = append(mappings, mapped...)
126 return info(index) << indexShift
129 // Create per-byte XOR mask.
131 for i := 0; i < len(orig); i++ {
132 b = append(b, orig[i]^mapped[i])
135 // Remove leading 0 bytes, but keep at least one byte.
136 for ; len(b) > 1 && b[0] == 0; b = b[1:] {
140 return xorBit | inlineXOR | info(b[0])<<indexShift
144 // Store the mapped value as is in the mappings table.
145 index := len(xorData)
146 if x, ok := xorCache[mapped]; ok {
149 xorCache[mapped] = index
150 xorData = append(xorData, byte(len(mapped)))
151 xorData = append(xorData, mapped...)
153 return xorBit | info(index)<<indexShift
156 // The following code implements a triegen.Compacter that was originally
157 // designed for normalization. The IDNA table has some similarities with the
158 // norm table. Using this compacter, together with the XOR pattern approach,
159 // reduces the table size by roughly 100K. It can probably be compressed further
160 // by also including elements of the compacter used by cases, but for now it is
163 const maxSparseEntries = 16
165 type normCompacter struct {
166 sparseBlocks [][]uint64
167 sparseOffset []uint16
171 func mostFrequentStride(a []uint64) int {
172 counts := make(map[int]int)
174 for _, x := range a {
175 if stride := int(x) - v; v != 0 && stride >= 0 {
181 for stride, cnt := range counts {
182 if cnt > maxc || (cnt == maxc && stride < maxs) {
183 maxs, maxc = stride, cnt
189 func countSparseEntries(a []uint64) int {
190 stride := mostFrequentStride(a)
192 for _, tv := range a {
193 if int(tv)-v != stride {
203 func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
204 if n := countSparseEntries(v); n <= maxSparseEntries {
205 return (n+1)*4 + 2, true
210 func (c *normCompacter) Store(v []uint64) uint32 {
211 h := uint32(len(c.sparseOffset))
212 c.sparseBlocks = append(c.sparseBlocks, v)
213 c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
214 c.sparseCount += countSparseEntries(v) + 1
218 func (c *normCompacter) Handler() string {
219 return "idnaSparse.lookup"
222 func (c *normCompacter) Print(w io.Writer) (retErr error) {
223 p := func(f string, x ...interface{}) {
224 if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil {
229 ls := len(c.sparseBlocks)
230 p("// idnaSparseOffset: %d entries, %d bytes\n", ls, ls*2)
231 p("var idnaSparseOffset = %#v\n\n", c.sparseOffset)
234 p("// idnaSparseValues: %d entries, %d bytes\n", ns, ns*4)
235 p("var idnaSparseValues = [%d]valueRange {", ns)
236 for i, b := range c.sparseBlocks {
237 p("\n// Block %#x, offset %#x", i, c.sparseOffset[i])
239 stride := mostFrequentStride(b)
240 n := countSparseEntries(b)
241 p("\n{value:%#04x,lo:%#02x},", stride, uint8(n))
242 for i, nv := range b {
243 if int(nv)-v != stride {
245 p(",hi:%#02x},", 0x80+i-1)
248 p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
254 p(",hi:%#02x},", 0x80+len(b)-1)