1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
9 // This program generates table.go and table_test.go based on the authoritative
10 // public suffix list at https://publicsuffix.org/list/effective_tld_names.dat
12 // The version is derived from
13 // https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat
14 // and a human-readable form is at
15 // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
17 // To fetch a particular git revision, such as 5c70ccd250, pass
18 // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
19 // and -version "an explicit version string".
35 "golang.org/x/net/idna"
39 // These sum of these four values must be no greater than 32.
42 nodesBitsTextOffset = 15
43 nodesBitsTextLength = 6
45 // These sum of these four values must be no greater than 32.
46 childrenBitsWildcard = 1
47 childrenBitsNodeType = 2
60 func max(a, b int) int {
67 func u32max(a, b uint32) uint32 {
77 nodeTypeParentOnly = 2
81 func nodeTypeStr(n int) string {
85 case nodeTypeException:
87 case nodeTypeParentOnly:
94 defaultURL = "https://publicsuffix.org/list/effective_tld_names.dat"
95 gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat"
99 labelEncoding = map[string]uint32{}
100 labelsList = []string{}
101 labelsMap = map[string]bool{}
104 // validSuffixRE is used to check that the entries in the public suffix
105 // list are in canonical form (after Punycode encoding). Specifically,
106 // capital letters are not allowed.
107 validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
109 shaRE = regexp.MustCompile(`"sha":"([^"]+)"`)
110 dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)
112 comments = flag.Bool("comments", false, "generate table.go comments, for debugging")
113 subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
114 url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
115 v = flag.Bool("v", false, "verbose output (to stderr)")
116 version = flag.String("version", "", "the effective_tld_names.dat version")
120 if err := main1(); err != nil {
121 fmt.Fprintln(os.Stderr, err)
128 if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 {
129 return fmt.Errorf("not enough bits to encode the nodes table")
131 if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
132 return fmt.Errorf("not enough bits to encode the children table")
135 if *url != defaultURL {
136 return fmt.Errorf("-version was not specified, and the -url is not the default one")
138 sha, date, err := gitCommit()
142 *version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date)
144 var r io.Reader = os.Stdin
146 res, err := http.Get(*url)
150 if res.StatusCode != http.StatusOK {
151 return fmt.Errorf("bad GET status for %s: %d", *url, res.Status)
154 defer res.Body.Close()
159 br := bufio.NewReader(r)
161 s, err := br.ReadString('\n')
168 s = strings.TrimSpace(s)
169 if strings.Contains(s, "BEGIN ICANN DOMAINS") {
173 if strings.Contains(s, "END ICANN DOMAINS") {
177 if s == "" || strings.HasPrefix(s, "//") {
180 s, err = idna.ToASCII(s)
184 if !validSuffixRE.MatchString(s) {
185 return fmt.Errorf("bad publicsuffix.org list data: %q", s)
190 case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
191 case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
192 case s == "ao" || strings.HasSuffix(s, ".ao"):
193 case s == "ar" || strings.HasSuffix(s, ".ar"):
194 case s == "arpa" || strings.HasSuffix(s, ".arpa"):
195 case s == "cy" || strings.HasSuffix(s, ".cy"):
196 case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
198 case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
199 case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
200 case s == "om" || strings.HasSuffix(s, ".om"):
201 case s == "uk" || strings.HasSuffix(s, ".uk"):
202 case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
203 case s == "tw" || strings.HasSuffix(s, ".tw"):
204 case s == "zw" || strings.HasSuffix(s, ".zw"):
205 case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
206 // xn--p1ai is Russian-Cyrillic "рф".
212 rules = append(rules, s)
214 nt, wildcard := nodeTypeNormal, false
216 case strings.HasPrefix(s, "*."):
217 s, nt = s[2:], nodeTypeParentOnly
219 case strings.HasPrefix(s, "!"):
220 s, nt = s[1:], nodeTypeException
222 labels := strings.Split(s, ".")
223 for n, i := &root, len(labels)-1; i >= 0; i-- {
227 if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
230 n.icann = n.icann && icann
231 n.wildcard = n.wildcard || wildcard
233 labelsMap[label] = true
236 labelsList = make([]string, 0, len(labelsMap))
237 for label := range labelsMap {
238 labelsList = append(labelsList, label)
240 sort.Strings(labelsList)
242 if err := generate(printReal, &root, "table.go"); err != nil {
245 if err := generate(printTest, &root, "table_test.go"); err != nil {
251 func generate(p func(io.Writer, *node) error, root *node, filename string) error {
252 buf := new(bytes.Buffer)
253 if err := p(buf, root); err != nil {
256 b, err := format.Source(buf.Bytes())
260 return ioutil.WriteFile(filename, b, 0644)
263 func gitCommit() (sha, date string, retErr error) {
264 res, err := http.Get(gitCommitURL)
268 if res.StatusCode != http.StatusOK {
269 return "", "", fmt.Errorf("bad GET status for %s: %d", gitCommitURL, res.Status)
271 defer res.Body.Close()
272 b, err := ioutil.ReadAll(res.Body)
276 if m := shaRE.FindSubmatch(b); m != nil {
279 if m := dateRE.FindSubmatch(b); m != nil {
282 if sha == "" || date == "" {
283 retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL)
285 return sha, date, retErr
288 func printTest(w io.Writer, n *node) error {
289 fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
290 fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n")
291 for _, rule := range rules {
292 fmt.Fprintf(w, "%q,\n", rule)
294 fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
295 if err := n.walk(w, printNodeLabel); err != nil {
298 fmt.Fprintf(w, "}\n")
302 func printReal(w io.Writer, n *node) error {
303 const header = `// generated by go run gen.go; DO NOT EDIT
310 nodesBitsChildren = %d
312 nodesBitsTextOffset = %d
313 nodesBitsTextLength = %d
315 childrenBitsWildcard = %d
316 childrenBitsNodeType = %d
323 nodeTypeException = %d
324 nodeTypeParentOnly = %d
327 // numTLD is the number of top level domains.
331 fmt.Fprintf(w, header, *version,
332 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
333 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
334 nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
336 text := combineText(labelsList)
338 return fmt.Errorf("internal error: makeText returned no text")
340 for _, label := range labelsList {
341 offset, length := strings.Index(text, label), len(label)
343 return fmt.Errorf("internal error: could not find %q in text %q", label, text)
345 maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
346 if offset >= 1<<nodesBitsTextOffset {
347 return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
349 if length >= 1<<nodesBitsTextLength {
350 return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
352 labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length)
354 fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
356 n, plus := len(text), ""
360 fmt.Fprintf(w, "%q%s\n", text[:n], plus)
364 if err := n.walk(w, assignIndexes); err != nil {
370 // nodes is the list of nodes. Each node is represented as a uint32, which
371 // encodes the node's children, wildcard bit and node type (as an index into
372 // the children array), ICANN bit and text.
374 // If the table was generated with the -comments flag, there is a //-comment
375 // after each node's data. In it is the nodes-array indexes of the children,
376 // formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
377 // nodeType is printed as + for normal, ! for exception, and o for parent-only
378 // nodes that have children but don't match a domain label in their own right.
379 // An I denotes an ICANN domain.
381 // The layout within the uint32, from MSB to LSB, is:
383 // [%2d bits] children index
384 // [%2d bits] ICANN bit
385 // [%2d bits] text index
386 // [%2d bits] text length
387 var nodes = [...]uint32{
389 32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
390 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
391 if err := n.walk(w, printNode); err != nil {
396 // children is the list of nodes' children, the parent's wildcard bit and the
397 // parent's node type. If a node has no children then their children index
398 // will be in the range [0, 6), depending on the wildcard bit and node type.
400 // The layout within the uint32, from MSB to LSB, is:
402 // [%2d bits] wildcard bit
403 // [%2d bits] node type
404 // [%2d bits] high nodes index (exclusive) of children
405 // [%2d bits] low nodes index (inclusive) of children
406 var children=[...]uint32{
408 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
409 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
410 for i, c := range childrenEncoding {
411 s := "---------------"
412 lo := c & (1<<childrenBitsLo - 1)
413 hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
415 s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
417 nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
418 wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
420 fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
421 c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
423 fmt.Fprintf(w, "0x%x,\n", c)
426 fmt.Fprintf(w, "}\n\n")
427 fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
428 fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
429 fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
430 fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
431 fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
440 // nodesIndex and childrenIndex are the index of this node in the nodes
441 // and the index of its children offset/length in the children arrays.
442 nodesIndex, childrenIndex int
443 // firstChild is the index of this node's first child, or zero if this
444 // node has no children.
446 // children are the node's children, in strictly increasing node label order.
450 func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
451 if err := f(w, n); err != nil {
454 for _, c := range n.children {
455 if err := c.walk(w, f); err != nil {
462 // child returns the child of n with the given label. The child is created if
463 // it did not exist beforehand.
464 func (n *node) child(label string) *node {
465 for _, c := range n.children {
466 if c.label == label {
472 nodeType: nodeTypeParentOnly,
475 n.children = append(n.children, c)
476 sort.Sort(byLabel(n.children))
482 func (b byLabel) Len() int { return len(b) }
483 func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
484 func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
486 var nextNodesIndex int
488 // childrenEncoding are the encoded entries in the generated children array.
489 // All these pre-defined entries have no children.
490 var childrenEncoding = []uint32{
491 0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
492 1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
493 2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
494 4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
495 5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
496 6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
499 var firstCallToAssignIndexes = true
501 func assignIndexes(w io.Writer, n *node) error {
502 if len(n.children) != 0 {
503 // Assign nodesIndex.
504 n.firstChild = nextNodesIndex
505 for _, c := range n.children {
506 c.nodesIndex = nextNodesIndex
510 // The root node's children is implicit.
511 if firstCallToAssignIndexes {
512 firstCallToAssignIndexes = false
516 // Assign childrenIndex.
517 maxChildren = max(maxChildren, len(childrenEncoding))
518 if len(childrenEncoding) >= 1<<nodesBitsChildren {
519 return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
521 n.childrenIndex = len(childrenEncoding)
522 lo := uint32(n.firstChild)
523 hi := lo + uint32(len(n.children))
524 maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
525 if lo >= 1<<childrenBitsLo {
526 return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
528 if hi >= 1<<childrenBitsHi {
529 return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
531 enc := hi<<childrenBitsLo | lo
532 enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
534 enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
536 childrenEncoding = append(childrenEncoding, enc)
538 n.childrenIndex = n.nodeType
540 n.childrenIndex += numNodeType
546 func printNode(w io.Writer, n *node) error {
547 for _, c := range n.children {
548 s := "---------------"
549 if len(c.children) != 0 {
550 s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children))
552 encoding := labelEncoding[c.label]
554 encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
556 encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
558 fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n",
559 encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard),
560 nodeTypeStr(c.nodeType), icannStr(c.icann), c.label,
563 fmt.Fprintf(w, "0x%x,\n", encoding)
569 func printNodeLabel(w io.Writer, n *node) error {
570 for _, c := range n.children {
571 fmt.Fprintf(w, "%q,\n", c.label)
576 func icannStr(icann bool) string {
583 func wildcardStr(wildcard bool) string {
590 // combineText combines all the strings in labelsList to form one giant string.
591 // Overlapping strings will be merged: "arpa" and "parliament" could yield
593 func combineText(labelsList []string) string {
595 for _, s := range labelsList {
596 beforeLength += len(s)
599 text := crush(removeSubstrings(labelsList))
601 fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
606 type byLength []string
608 func (s byLength) Len() int { return len(s) }
609 func (s byLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
610 func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
612 // removeSubstrings returns a copy of its input with any strings removed
613 // that are substrings of other provided strings.
614 func removeSubstrings(input []string) []string {
615 // Make a copy of input.
616 ss := append(make([]string, 0, len(input)), input...)
617 sort.Sort(byLength(ss))
619 for i, shortString := range ss {
620 // For each string, only consider strings higher than it in sort order, i.e.
621 // of equal length or greater.
622 for _, longString := range ss[i+1:] {
623 if strings.Contains(longString, shortString) {
630 // Remove the empty strings.
632 for len(ss) > 0 && ss[0] == "" {
638 // crush combines a list of strings, taking advantage of overlaps. It returns a
639 // single string that contains each input string as a substring.
640 func crush(ss []string) string {
642 for _, s := range ss {
643 if maxLabelLen < len(s) {
648 for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
649 prefixes := makePrefixMap(ss, prefixLen)
650 for i, s := range ss {
651 if len(s) <= prefixLen {
654 mergeLabel(ss, i, prefixLen, prefixes)
658 return strings.Join(ss, "")
661 // mergeLabel merges the label at ss[i] with the first available matching label
662 // in prefixMap, where the last "prefixLen" characters in ss[i] match the first
663 // "prefixLen" characters in the matching label.
664 // It will merge ss[i] repeatedly until no more matches are available.
665 // All matching labels merged into ss[i] are replaced by "".
666 func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
668 suffix := s[len(s)-prefixLen:]
669 for _, j := range prefixes[suffix] {
670 // Empty strings mean "already used." Also avoid merging with self.
671 if ss[j] == "" || i == j {
675 fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
676 prefixLen, i, j, ss[i], ss[j], suffix)
678 ss[i] += ss[j][prefixLen:]
680 // ss[i] has a new suffix, so merge again if possible.
681 // Note: we only have to merge again at the same prefix length. Shorter
682 // prefix lengths will be handled in the next iteration of crush's for loop.
683 // Can there be matches for longer prefix lengths, introduced by the merge?
684 // I believe that any such matches would by necessity have been eliminated
685 // during substring removal or merged at a higher prefix length. For
686 // instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
687 // would yield "abcde", which could be merged with "bcdef." However, in
688 // practice "cde" would already have been elimintated by removeSubstrings.
689 mergeLabel(ss, i, prefixLen, prefixes)
694 // prefixMap maps from a prefix to a list of strings containing that prefix. The
695 // list of strings is represented as indexes into a slice of strings stored
697 type prefixMap map[string][]int
699 // makePrefixMap constructs a prefixMap from a slice of strings.
700 func makePrefixMap(ss []string, prefixLen int) prefixMap {
701 prefixes := make(prefixMap)
702 for i, s := range ss {
703 // We use < rather than <= because if a label matches on a prefix equal to
704 // its full length, that's actually a substring match handled by
706 if prefixLen < len(s) {
707 prefix := s[:prefixLen]
708 prefixes[prefix] = append(prefixes[prefix], i)