Improve how reference regexps are built

The previous implementation performed many string -> regexp -> string
conversions.

Signed-off-by: Paul Cacheux <paul.cacheux@datadoghq.com>
This commit is contained in:
Paul Cacheux 2022-01-13 16:49:03 +01:00
parent bb1fb61445
commit 1c89ce5fc1

View file

@ -3,145 +3,157 @@ package reference
import "regexp" import "regexp"
var ( var (
// alphaNumericRegexp defines the alpha numeric atom, typically a // alphaNumeric defines the alpha numeric atom, typically a
// component of names. This only allows lower case characters and digits. // component of names. This only allows lower case characters and digits.
alphaNumericRegexp = match(`[a-z0-9]+`) alphaNumeric = `[a-z0-9]+`
// separatorRegexp defines the separators allowed to be embedded in name // separator defines the separators allowed to be embedded in name
// components. This allows one period, one or two underscores and multiple // components. This allows one period, one or two underscores and multiple
// dashes. Repeated dashes and underscores are intentionally treated // dashes. Repeated dashes and underscores are intentionally treated
// differently. In order to support valid hostnames as name components, // differently. In order to support valid hostnames as name components,
// support for repeated dashes was added. Additionally, double underscore is // support for repeated dashes was added. Additionally, double underscore is
// now allowed as a separator to loosen the restriction for previously // now allowed as a separator to loosen the restriction for previously
// supported names. // supported names.
separatorRegexp = match(`(?:[._]|__|[-]*)`) separator = `(?:[._]|__|[-]*)`
// nameComponentRegexp restricts registry path component names to start // nameComponent restricts registry path component names to start
// with at least one letter or number, with following parts able to be // with at least one letter or number, with following parts able to be
// separated by one period, one or two underscores and multiple dashes. // separated by one period, one or two underscores and multiple dashes.
nameComponentRegexp = expression( nameComponent = expression(
alphaNumericRegexp, alphaNumeric,
optional(repeated(separatorRegexp, alphaNumericRegexp))) optional(repeated(separator, alphaNumeric)))
// domainComponentRegexp restricts a single label of the registry domain // domainComponent restricts a single label of the registry domain
// to start and end with a letter or digit, with dashes allowed in between. // to start and end with a letter or digit, with dashes allowed in between.
// The optional port is added by DomainRegexp, not by this component. // The optional port is added by domain, not by this component.
domainComponentRegexp = match(`(?:[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])`) domainComponent = `(?:[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])`
domain = expression(
domainComponent,
optional(repeated(literal(`.`), domainComponent)),
optional(literal(`:`), `[0-9]+`))
// DomainRegexp defines the structure of potential domain components // DomainRegexp defines the structure of potential domain components
// that may be part of image names. This is purposely a subset of what is // that may be part of image names. This is purposely a subset of what is
// allowed by DNS to ensure backwards compatibility with Docker image // allowed by DNS to ensure backwards compatibility with Docker image
// names. // names.
DomainRegexp = expression( DomainRegexp = re(domain)
domainComponentRegexp,
optional(repeated(literal(`.`), domainComponentRegexp)),
optional(literal(`:`), match(`[0-9]+`)))
tag = `[\w][\w.-]{0,127}`
// TagRegexp matches valid tag names. From docker/docker:graph/tags.go. // TagRegexp matches valid tag names. From docker/docker:graph/tags.go.
TagRegexp = match(`[\w][\w.-]{0,127}`) TagRegexp = re(tag)
anchoredTag = anchored(tag)
// anchoredTagRegexp matches valid tag names, anchored at the start and // anchoredTagRegexp matches valid tag names, anchored at the start and
// end of the matched string. // end of the matched string.
anchoredTagRegexp = anchored(TagRegexp) anchoredTagRegexp = re(anchoredTag)
digestPat = `[A-Za-z][A-Za-z0-9]*(?:[-_+.][A-Za-z][A-Za-z0-9]*)*[:][[:xdigit:]]{32,}`
// DigestRegexp matches valid digests. // DigestRegexp matches valid digests.
DigestRegexp = match(`[A-Za-z][A-Za-z0-9]*(?:[-_+.][A-Za-z][A-Za-z0-9]*)*[:][[:xdigit:]]{32,}`) DigestRegexp = re(digestPat)
anchoredDigest = anchored(digestPat)
// anchoredDigestRegexp matches valid digests, anchored at the start and // anchoredDigestRegexp matches valid digests, anchored at the start and
// end of the matched string. // end of the matched string.
anchoredDigestRegexp = anchored(DigestRegexp) anchoredDigestRegexp = re(anchoredDigest)
namePat = expression(
optional(domain, literal(`/`)),
nameComponent,
optional(repeated(literal(`/`), nameComponent)))
// NameRegexp is the format for the name component of references: an // NameRegexp is the format for the name component of references: an
// optional domain and slash, then one or more slash-separated name // optional domain and slash, then one or more slash-separated name
// components. It has no capturing groups; anchoredNameRegexp has them. // components. It has no capturing groups; anchoredNameRegexp has them.
NameRegexp = expression( NameRegexp = re(namePat)
optional(DomainRegexp, literal(`/`)),
nameComponentRegexp,
optional(repeated(literal(`/`), nameComponentRegexp)))
anchoredName = anchored(
optional(capture(domain), literal(`/`)),
capture(nameComponent,
optional(repeated(literal(`/`), nameComponent))))
// anchoredNameRegexp is used to parse a name value, capturing the // anchoredNameRegexp is used to parse a name value, capturing the
// domain and trailing components. // domain and trailing components.
anchoredNameRegexp = anchored( anchoredNameRegexp = re(anchoredName)
optional(capture(DomainRegexp), literal(`/`)),
capture(nameComponentRegexp,
optional(repeated(literal(`/`), nameComponentRegexp))))
referencePat = anchored(capture(namePat),
optional(literal(":"), capture(tag)),
optional(literal("@"), capture(digestPat)))
// ReferenceRegexp is the full supported format of a reference. The regexp // ReferenceRegexp is the full supported format of a reference. The regexp
// is anchored and has capturing groups for name, tag, and digest // is anchored and has capturing groups for name, tag, and digest
// components. // components.
ReferenceRegexp = anchored(capture(NameRegexp), ReferenceRegexp = re(referencePat)
optional(literal(":"), capture(TagRegexp)),
optional(literal("@"), capture(DigestRegexp)))
identifier = `([a-f0-9]{64})`
// IdentifierRegexp is the format for string identifier used as a // IdentifierRegexp is the format for string identifier used as a
// content addressable identifier using sha256. These identifiers // content addressable identifier using sha256. These identifiers
// are like digests without the algorithm, since sha256 is used. // are like digests without the algorithm, since sha256 is used.
IdentifierRegexp = match(`([a-f0-9]{64})`) IdentifierRegexp = re(identifier)
shortIdentifier = `([a-f0-9]{6,64})`
// ShortIdentifierRegexp is the format used to represent a prefix // ShortIdentifierRegexp is the format used to represent a prefix
// of an identifier. A prefix may be used to match a sha256 identifier // of an identifier. A prefix may be used to match a sha256 identifier
// within a list of trusted identifiers. // within a list of trusted identifiers.
ShortIdentifierRegexp = match(`([a-f0-9]{6,64})`) ShortIdentifierRegexp = re(shortIdentifier)
anchoredIdentifier = anchored(identifier)
// anchoredIdentifierRegexp is used to check or match an // anchoredIdentifierRegexp is used to check or match an
// identifier value, anchored at start and end of string. // identifier value, anchored at start and end of string.
anchoredIdentifierRegexp = anchored(IdentifierRegexp) anchoredIdentifierRegexp = re(anchoredIdentifier)
anchoredShortIdentifier = anchored(shortIdentifier)
// anchoredShortIdentifierRegexp is used to check if a value // anchoredShortIdentifierRegexp is used to check if a value
// is a possible identifier prefix, anchored at start and end // is a possible identifier prefix, anchored at start and end
// of string. // of string.
anchoredShortIdentifierRegexp = anchored(ShortIdentifierRegexp) anchoredShortIdentifierRegexp = re(anchoredShortIdentifier)
) )
// match compiles the string to a regular expression. // re compiles the string to a regular expression.
var match = regexp.MustCompile var re = regexp.MustCompile
// literal compiles s into a literal regular expression, escaping any regexp // literal compiles s into a literal regular expression, escaping any regexp
// reserved characters. // reserved characters.
func literal(s string) *regexp.Regexp { func literal(s string) string {
re := match(regexp.QuoteMeta(s)) re := re(regexp.QuoteMeta(s))
if _, complete := re.LiteralPrefix(); !complete { if _, complete := re.LiteralPrefix(); !complete {
panic("must be a literal") panic("must be a literal")
} }
return re return re.String()
} }
// expression defines a full expression, where each regular expression must // expression defines a full expression, where each regular expression must
// follow the previous. // follow the previous.
func expression(res ...*regexp.Regexp) *regexp.Regexp { func expression(res ...string) string {
var s string var s string
for _, re := range res { for _, re := range res {
s += re.String() s += re
} }
return match(s) return s
} }
// optional wraps the expression in a non-capturing group and makes the // optional wraps the expression in a non-capturing group and makes the
// production optional. // production optional.
func optional(res ...*regexp.Regexp) *regexp.Regexp { func optional(res ...string) string {
return match(group(expression(res...)).String() + `?`) return group(expression(res...)) + `?`
} }
// repeated wraps the regexp in a non-capturing group to get one or more // repeated wraps the regexp in a non-capturing group to get one or more
// matches. // matches.
func repeated(res ...*regexp.Regexp) *regexp.Regexp { func repeated(res ...string) string {
return match(group(expression(res...)).String() + `+`) return group(expression(res...)) + `+`
} }
// group wraps the regexp in a non-capturing group. // group wraps the regexp in a non-capturing group.
func group(res ...*regexp.Regexp) *regexp.Regexp { func group(res ...string) string {
return match(`(?:` + expression(res...).String() + `)`) return `(?:` + expression(res...) + `)`
} }
// capture wraps the expression in a capturing group. // capture wraps the expression in a capturing group.
func capture(res ...*regexp.Regexp) *regexp.Regexp { func capture(res ...string) string {
return match(`(` + expression(res...).String() + `)`) return `(` + expression(res...) + `)`
} }
// anchored anchors the regular expression by adding start and end delimiters. // anchored anchors the regular expression by adding start and end delimiters.
func anchored(res ...*regexp.Regexp) *regexp.Regexp { func anchored(res ...string) string {
return match(`^` + expression(res...).String() + `$`) return `^` + expression(res...) + `$`
} }