a685e3fc98
Vndr has a simpler configuration and allows pointing to forked packages. Additionally other docker projects are now using vndr making vendoring in distribution more consistent. Updates letsencrypt to use fork. No longer uses sub-vendored packages. Signed-off-by: Derek McGowan <derek@mcgstyle.net> (github: dmcgowan)
663 lines
19 KiB
Go
663 lines
19 KiB
Go
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore
|
|
|
|
package main
|
|
|
|
// This program generates table.go and table_test.go.
// Invoke as:
//
//	go run gen.go -version "xxx"       >table.go
//	go run gen.go -version "xxx" -test >table_test.go
//
// Pass -v to print verbose progress information.
//
// The version is derived from information found at
// https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
//
// To fetch a particular git revision, such as 5c70ccd250, pass
// -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"flag"
|
|
"fmt"
|
|
"go/format"
|
|
"io"
|
|
"net/http"
|
|
"os"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
|
|
"golang.org/x/net/idna"
|
|
)
|
|
|
|
// Bit widths of the fields packed into each uint32 entry of the generated
// nodes and children tables. main1 rejects configurations whose widths do
// not fit in 32 bits.
const (
	// The sum of these four values must be no greater than 32.
	nodesBitsChildren   = 9
	nodesBitsICANN      = 1
	nodesBitsTextOffset = 15
	nodesBitsTextLength = 6

	// The sum of these four values must be no greater than 32.
	childrenBitsWildcard = 1
	childrenBitsNodeType = 2
	childrenBitsHi       = 14
	childrenBitsLo       = 14
)
|
|
|
|
// Largest values seen while building the tables. printReal reports them next
// to the capacity of the corresponding bit field at the end of its output.
var (
	maxChildren   int
	maxTextOffset int
	maxTextLength int
	maxHi         uint32
	maxLo         uint32
)
|
|
|
|
// max returns the larger of a and b.
func max(a, b int) int {
	if a < b {
		return b
	}
	return a
}
|
|
|
|
// u32max returns the larger of a and b.
func u32max(a, b uint32) uint32 {
	if a < b {
		return b
	}
	return a
}
|
|
|
|
// Node types. A node's type is stored in the node-type field of its children
// table entry; numNodeType is the number of distinct types.
const (
	nodeTypeNormal     = 0
	nodeTypeException  = 1
	nodeTypeParentOnly = 2
	numNodeType        = 3
)
|
|
|
|
func nodeTypeStr(n int) string {
|
|
switch n {
|
|
case nodeTypeNormal:
|
|
return "+"
|
|
case nodeTypeException:
|
|
return "!"
|
|
case nodeTypeParentOnly:
|
|
return "o"
|
|
}
|
|
panic("unreachable")
|
|
}
|
|
|
|
var (
	// labelEncoding maps a label to its packed text offset and length, as
	// filled in by printReal.
	labelEncoding = map[string]uint32{}
	// labelsList is the sorted list of distinct labels; labelsMap is the set
	// it is built from.
	labelsList = []string{}
	labelsMap  = map[string]bool{}
	// rules holds every accepted rule in input order, after conversion to
	// ASCII but before any "*." or "!" prefix is stripped.
	rules = []string{}

	// validSuffix is used to check that the entries in the public suffix list
	// are in canonical form (after Punycode encoding). Specifically, capital
	// letters are not allowed.
	validSuffix = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)

	subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
	url    = flag.String("url",
		"https://publicsuffix.org/list/effective_tld_names.dat",
		"URL of the publicsuffix.org list. If empty, stdin is read instead")
	v       = flag.Bool("v", false, "verbose output (to stderr)")
	version = flag.String("version", "", "the effective_tld_names.dat version")
	test    = flag.Bool("test", false, "generate table_test.go")
)
|
|
|
|
func main() {
|
|
if err := main1(); err != nil {
|
|
fmt.Fprintln(os.Stderr, err)
|
|
os.Exit(1)
|
|
}
|
|
}
|
|
|
|
func main1() error {
|
|
flag.Parse()
|
|
if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 {
|
|
return fmt.Errorf("not enough bits to encode the nodes table")
|
|
}
|
|
if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
|
|
return fmt.Errorf("not enough bits to encode the children table")
|
|
}
|
|
if *version == "" {
|
|
return fmt.Errorf("-version was not specified")
|
|
}
|
|
var r io.Reader = os.Stdin
|
|
if *url != "" {
|
|
res, err := http.Get(*url)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if res.StatusCode != http.StatusOK {
|
|
return fmt.Errorf("bad GET status for %s: %d", *url, res.Status)
|
|
}
|
|
r = res.Body
|
|
defer res.Body.Close()
|
|
}
|
|
|
|
var root node
|
|
icann := false
|
|
buf := new(bytes.Buffer)
|
|
br := bufio.NewReader(r)
|
|
for {
|
|
s, err := br.ReadString('\n')
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return err
|
|
}
|
|
s = strings.TrimSpace(s)
|
|
if strings.Contains(s, "BEGIN ICANN DOMAINS") {
|
|
icann = true
|
|
continue
|
|
}
|
|
if strings.Contains(s, "END ICANN DOMAINS") {
|
|
icann = false
|
|
continue
|
|
}
|
|
if s == "" || strings.HasPrefix(s, "//") {
|
|
continue
|
|
}
|
|
s, err = idna.ToASCII(s)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !validSuffix.MatchString(s) {
|
|
return fmt.Errorf("bad publicsuffix.org list data: %q", s)
|
|
}
|
|
|
|
if *subset {
|
|
switch {
|
|
case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
|
|
case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
|
|
case s == "ao" || strings.HasSuffix(s, ".ao"):
|
|
case s == "ar" || strings.HasSuffix(s, ".ar"):
|
|
case s == "arpa" || strings.HasSuffix(s, ".arpa"):
|
|
case s == "cy" || strings.HasSuffix(s, ".cy"):
|
|
case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
|
|
case s == "jp":
|
|
case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
|
|
case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
|
|
case s == "om" || strings.HasSuffix(s, ".om"):
|
|
case s == "uk" || strings.HasSuffix(s, ".uk"):
|
|
case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
|
|
case s == "tw" || strings.HasSuffix(s, ".tw"):
|
|
case s == "zw" || strings.HasSuffix(s, ".zw"):
|
|
case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
|
|
// xn--p1ai is Russian-Cyrillic "рф".
|
|
default:
|
|
continue
|
|
}
|
|
}
|
|
|
|
rules = append(rules, s)
|
|
|
|
nt, wildcard := nodeTypeNormal, false
|
|
switch {
|
|
case strings.HasPrefix(s, "*."):
|
|
s, nt = s[2:], nodeTypeParentOnly
|
|
wildcard = true
|
|
case strings.HasPrefix(s, "!"):
|
|
s, nt = s[1:], nodeTypeException
|
|
}
|
|
labels := strings.Split(s, ".")
|
|
for n, i := &root, len(labels)-1; i >= 0; i-- {
|
|
label := labels[i]
|
|
n = n.child(label)
|
|
if i == 0 {
|
|
if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
|
|
n.nodeType = nt
|
|
}
|
|
n.icann = n.icann && icann
|
|
n.wildcard = n.wildcard || wildcard
|
|
}
|
|
labelsMap[label] = true
|
|
}
|
|
}
|
|
labelsList = make([]string, 0, len(labelsMap))
|
|
for label := range labelsMap {
|
|
labelsList = append(labelsList, label)
|
|
}
|
|
sort.Strings(labelsList)
|
|
|
|
p := printReal
|
|
if *test {
|
|
p = printTest
|
|
}
|
|
if err := p(buf, &root); err != nil {
|
|
return err
|
|
}
|
|
|
|
b, err := format.Source(buf.Bytes())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
_, err = os.Stdout.Write(b)
|
|
return err
|
|
}
|
|
|
|
func printTest(w io.Writer, n *node) error {
|
|
fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
|
|
fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n")
|
|
for _, rule := range rules {
|
|
fmt.Fprintf(w, "%q,\n", rule)
|
|
}
|
|
fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
|
|
if err := n.walk(w, printNodeLabel); err != nil {
|
|
return err
|
|
}
|
|
fmt.Fprintf(w, "}\n")
|
|
return nil
|
|
}
|
|
|
|
// printReal writes the contents of table.go to w: the version and layout
// constants, the combined label text, the nodes table, the children table,
// and a trailing comment with the largest value seen for each bit field.
//
// As side effects it fills labelEncoding and, via assignIndexes, numbers
// every node below n. It returns an error if a value does not fit in its bit
// field.
func printReal(w io.Writer, n *node) error {
	const header = `// generated by go run gen.go; DO NOT EDIT

package publicsuffix

const version = %q

const (
	nodesBitsChildren   = %d
	nodesBitsICANN      = %d
	nodesBitsTextOffset = %d
	nodesBitsTextLength = %d

	childrenBitsWildcard = %d
	childrenBitsNodeType = %d
	childrenBitsHi       = %d
	childrenBitsLo       = %d
)

const (
	nodeTypeNormal     = %d
	nodeTypeException  = %d
	nodeTypeParentOnly = %d
)

// numTLD is the number of top level domains.
const numTLD = %d

`
	fmt.Fprintf(w, header, *version,
		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
		nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))

	text := combineText(labelsList)
	if text == "" {
		return fmt.Errorf("internal error: combineText returned no text")
	}
	// Record where each label lives inside text.
	for _, label := range labelsList {
		offset, length := strings.Index(text, label), len(label)
		if offset < 0 {
			return fmt.Errorf("internal error: could not find %q in text %q", label, text)
		}
		maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
		if offset >= 1<<nodesBitsTextOffset {
			return fmt.Errorf("text offset %d is too large, or nodesBitsTextOffset is too small", offset)
		}
		if length >= 1<<nodesBitsTextLength {
			return fmt.Errorf("text length %d is too large, or nodesBitsTextLength is too small", length)
		}
		labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length)
	}

	// Emit text as a concatenation of string literals of at most 64 bytes.
	fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
	for len(text) > 0 {
		chunk, plus := len(text), ""
		if chunk > 64 {
			chunk, plus = 64, " +"
		}
		fmt.Fprintf(w, "%q%s\n", text[:chunk], plus)
		text = text[chunk:]
	}

	if err := n.walk(w, assignIndexes); err != nil {
		return err
	}

	fmt.Fprintf(w, `

// nodes is the list of nodes. Each node is represented as a uint32, which
// encodes the node's children, wildcard bit and node type (as an index into
// the children array), ICANN bit and text.
//
// In the //-comment after each node's data, the nodes indexes of the children
// are formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
// nodeType is printed as + for normal, ! for exception, and o for parent-only
// nodes that have children but don't match a domain label in their own right.
// An I denotes an ICANN domain.
//
// The layout within the uint32, from MSB to LSB, is:
// [%2d bits] unused
// [%2d bits] children index
// [%2d bits] ICANN bit
// [%2d bits] text index
// [%2d bits] text length
var nodes = [...]uint32{
`,
		32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
	if err := n.walk(w, printNode); err != nil {
		return err
	}
	fmt.Fprintf(w, `}

// children is the list of nodes' children, the parent's wildcard bit and the
// parent's node type. If a node has no children then their children index
// will be in the range [0, 6), depending on the wildcard bit and node type.
//
// The layout within the uint32, from MSB to LSB, is:
// [%2d bits] unused
// [%2d bits] wildcard bit
// [%2d bits] node type
// [%2d bits] high nodes index (exclusive) of children
// [%2d bits] low nodes index (inclusive) of children
var children=[...]uint32{
`,
		32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
	for i, c := range childrenEncoding {
		// Decode the entry again so that the comment reflects what was
		// actually encoded.
		s := "---------------"
		lo := c & (1<<childrenBitsLo - 1)
		hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
		if lo != hi {
			s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
		}
		nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
		wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
		fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
			c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
	}
	fmt.Fprintf(w, "}\n\n")
	fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
	fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
	fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
	fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
	fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
	return nil
}
|
|
|
|
// node is one label in the tree built from the public suffix list. The root
// node has an empty label and its children are the top level domains.
type node struct {
	label    string
	nodeType int
	icann    bool
	wildcard bool
	// nodesIndex and childrenIndex are the index of this node in the nodes
	// and the index of its children offset/length in the children arrays.
	nodesIndex, childrenIndex int
	// firstChild is the index of this node's first child, or zero if this
	// node has no children.
	firstChild int
	// children are the node's children, in strictly increasing node label order.
	children []*node
}
|
|
|
|
func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
|
|
if err := f(w, n); err != nil {
|
|
return err
|
|
}
|
|
for _, c := range n.children {
|
|
if err := c.walk(w, f); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// child returns the child of n with the given label. The child is created if
|
|
// it did not exist beforehand.
|
|
func (n *node) child(label string) *node {
|
|
for _, c := range n.children {
|
|
if c.label == label {
|
|
return c
|
|
}
|
|
}
|
|
c := &node{
|
|
label: label,
|
|
nodeType: nodeTypeParentOnly,
|
|
icann: true,
|
|
}
|
|
n.children = append(n.children, c)
|
|
sort.Sort(byLabel(n.children))
|
|
return c
|
|
}
|
|
|
|
type byLabel []*node
|
|
|
|
func (b byLabel) Len() int { return len(b) }
|
|
func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
|
func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
|
|
|
|
// nextNodesIndex is the nodes-table index that assignIndexes gives to the
// next child node it numbers.
var nextNodesIndex int
|
|
|
|
// childrenEncoding are the encoded entries in the generated children array.
|
|
// All these pre-defined entries have no children.
|
|
var childrenEncoding = []uint32{
|
|
0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
|
|
1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
|
|
2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
|
|
4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
|
|
5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
|
|
6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
|
|
}
|
|
|
|
// firstCallToAssignIndexes is true until assignIndexes has processed its
// first node with children. That node gets no entry in the children table.
var firstCallToAssignIndexes = true
|
|
|
|
func assignIndexes(w io.Writer, n *node) error {
|
|
if len(n.children) != 0 {
|
|
// Assign nodesIndex.
|
|
n.firstChild = nextNodesIndex
|
|
for _, c := range n.children {
|
|
c.nodesIndex = nextNodesIndex
|
|
nextNodesIndex++
|
|
}
|
|
|
|
// The root node's children is implicit.
|
|
if firstCallToAssignIndexes {
|
|
firstCallToAssignIndexes = false
|
|
return nil
|
|
}
|
|
|
|
// Assign childrenIndex.
|
|
maxChildren = max(maxChildren, len(childrenEncoding))
|
|
if len(childrenEncoding) >= 1<<nodesBitsChildren {
|
|
return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
|
|
}
|
|
n.childrenIndex = len(childrenEncoding)
|
|
lo := uint32(n.firstChild)
|
|
hi := lo + uint32(len(n.children))
|
|
maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
|
|
if lo >= 1<<childrenBitsLo {
|
|
return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
|
|
}
|
|
if hi >= 1<<childrenBitsHi {
|
|
return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
|
|
}
|
|
enc := hi<<childrenBitsLo | lo
|
|
enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
|
|
if n.wildcard {
|
|
enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
|
|
}
|
|
childrenEncoding = append(childrenEncoding, enc)
|
|
} else {
|
|
n.childrenIndex = n.nodeType
|
|
if n.wildcard {
|
|
n.childrenIndex += numNodeType
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func printNode(w io.Writer, n *node) error {
|
|
for _, c := range n.children {
|
|
s := "---------------"
|
|
if len(c.children) != 0 {
|
|
s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children))
|
|
}
|
|
encoding := labelEncoding[c.label]
|
|
if c.icann {
|
|
encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
|
|
}
|
|
encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
|
|
fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n",
|
|
encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard),
|
|
nodeTypeStr(c.nodeType), icannStr(c.icann), c.label,
|
|
)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func printNodeLabel(w io.Writer, n *node) error {
|
|
for _, c := range n.children {
|
|
fmt.Fprintf(w, "%q,\n", c.label)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// icannStr returns "I" if icann is set and a single space otherwise, for use
// in the comments of the generated nodes table.
func icannStr(icann bool) string {
	if icann {
		return "I"
	}
	return " "
}
|
|
|
|
// wildcardStr returns "*" if wildcard is set and a single space otherwise,
// for use in the comments of the generated tables.
func wildcardStr(wildcard bool) string {
	if wildcard {
		return "*"
	}
	return " "
}
|
|
|
|
// combineText combines all the strings in labelsList to form one giant string.
|
|
// Overlapping strings will be merged: "arpa" and "parliament" could yield
|
|
// "arparliament".
|
|
func combineText(labelsList []string) string {
|
|
beforeLength := 0
|
|
for _, s := range labelsList {
|
|
beforeLength += len(s)
|
|
}
|
|
|
|
text := crush(removeSubstrings(labelsList))
|
|
if *v {
|
|
fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
|
|
}
|
|
return text
|
|
}
|
|
|
|
// byLength implements sort.Interface, ordering strings by increasing length.
type byLength []string

func (s byLength) Len() int           { return len(s) }
func (s byLength) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
|
|
|
|
// removeSubstrings returns a copy of its input with any strings removed
|
|
// that are substrings of other provided strings.
|
|
func removeSubstrings(input []string) []string {
|
|
// Make a copy of input.
|
|
ss := append(make([]string, 0, len(input)), input...)
|
|
sort.Sort(byLength(ss))
|
|
|
|
for i, shortString := range ss {
|
|
// For each string, only consider strings higher than it in sort order, i.e.
|
|
// of equal length or greater.
|
|
for _, longString := range ss[i+1:] {
|
|
if strings.Contains(longString, shortString) {
|
|
ss[i] = ""
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// Remove the empty strings.
|
|
sort.Strings(ss)
|
|
for len(ss) > 0 && ss[0] == "" {
|
|
ss = ss[1:]
|
|
}
|
|
return ss
|
|
}
|
|
|
|
// crush combines a list of strings, taking advantage of overlaps. It returns a
|
|
// single string that contains each input string as a substring.
|
|
func crush(ss []string) string {
|
|
maxLabelLen := 0
|
|
for _, s := range ss {
|
|
if maxLabelLen < len(s) {
|
|
maxLabelLen = len(s)
|
|
}
|
|
}
|
|
|
|
for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
|
|
prefixes := makePrefixMap(ss, prefixLen)
|
|
for i, s := range ss {
|
|
if len(s) <= prefixLen {
|
|
continue
|
|
}
|
|
mergeLabel(ss, i, prefixLen, prefixes)
|
|
}
|
|
}
|
|
|
|
return strings.Join(ss, "")
|
|
}
|
|
|
|
// mergeLabel merges the label at ss[i] with the first available matching label
|
|
// in prefixMap, where the last "prefixLen" characters in ss[i] match the first
|
|
// "prefixLen" characters in the matching label.
|
|
// It will merge ss[i] repeatedly until no more matches are available.
|
|
// All matching labels merged into ss[i] are replaced by "".
|
|
func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
|
|
s := ss[i]
|
|
suffix := s[len(s)-prefixLen:]
|
|
for _, j := range prefixes[suffix] {
|
|
// Empty strings mean "already used." Also avoid merging with self.
|
|
if ss[j] == "" || i == j {
|
|
continue
|
|
}
|
|
if *v {
|
|
fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
|
|
prefixLen, i, j, ss[i], ss[j], suffix)
|
|
}
|
|
ss[i] += ss[j][prefixLen:]
|
|
ss[j] = ""
|
|
// ss[i] has a new suffix, so merge again if possible.
|
|
// Note: we only have to merge again at the same prefix length. Shorter
|
|
// prefix lengths will be handled in the next iteration of crush's for loop.
|
|
// Can there be matches for longer prefix lengths, introduced by the merge?
|
|
// I believe that any such matches would by necessity have been eliminated
|
|
// during substring removal or merged at a higher prefix length. For
|
|
// instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
|
|
// would yield "abcde", which could be merged with "bcdef." However, in
|
|
// practice "cde" would already have been elimintated by removeSubstrings.
|
|
mergeLabel(ss, i, prefixLen, prefixes)
|
|
return
|
|
}
|
|
}
|
|
|
|
// prefixMap maps from a prefix to a list of strings containing that prefix. The
// list of strings is represented as indexes into a slice of strings stored
// elsewhere.
type prefixMap map[string][]int
|
|
|
|
// makePrefixMap constructs a prefixMap from a slice of strings.
|
|
func makePrefixMap(ss []string, prefixLen int) prefixMap {
|
|
prefixes := make(prefixMap)
|
|
for i, s := range ss {
|
|
// We use < rather than <= because if a label matches on a prefix equal to
|
|
// its full length, that's actually a substring match handled by
|
|
// removeSubstrings.
|
|
if prefixLen < len(s) {
|
|
prefix := s[:prefixLen]
|
|
prefixes[prefix] = append(prefixes[prefix], i)
|
|
}
|
|
}
|
|
|
|
return prefixes
|
|
}
|