From 1c89ce5fc1cec85e0726f598bccc12b873d04de3 Mon Sep 17 00:00:00 2001 From: Paul Cacheux Date: Thu, 13 Jan 2022 16:49:03 +0100 Subject: [PATCH] Improve how reference regexps are built Previous implementation was doing a lot of string -> regexp -> string conversions Signed-off-by: Paul Cacheux --- reference/regexp.go | 114 ++++++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 51 deletions(-) diff --git a/reference/regexp.go b/reference/regexp.go index 78e2f9170..b738cdf84 100644 --- a/reference/regexp.go +++ b/reference/regexp.go @@ -3,145 +3,157 @@ package reference import "regexp" var ( - // alphaNumericRegexp defines the alpha numeric atom, typically a + // alphaNumeric defines the alpha numeric atom, typically a // component of names. This only allows lower case characters and digits. - alphaNumericRegexp = match(`[a-z0-9]+`) + alphaNumeric = `[a-z0-9]+` - // separatorRegexp defines the separators allowed to be embedded in name + // separator defines the separators allowed to be embedded in name // components. This allow one period, one or two underscore and multiple // dashes. Repeated dashes and underscores are intentionally treated // differently. In order to support valid hostnames as name components, // supporting repeated dash was added. Additionally double underscore is // now allowed as a separator to loosen the restriction for previously // supported names. - separatorRegexp = match(`(?:[._]|__|[-]*)`) + separator = `(?:[._]|__|[-]*)` - // nameComponentRegexp restricts registry path component names to start + // nameComponent restricts registry path component names to start // with at least one letter or number, with following parts able to be // separated by one period, one or two underscore and multiple dashes. - nameComponentRegexp = expression( - alphaNumericRegexp, - optional(repeated(separatorRegexp, alphaNumericRegexp))) + nameComponent = expression( + alphaNumeric, + optional(repeated(separator, alphaNumeric))) - // domainComponentRegexp restricts the registry domain component of a + // domainComponent restricts the registry domain component of a // repository name to start with a component as defined by DomainRegexp // and followed by an optional port. - domainComponentRegexp = match(`(?:[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])`) + domainComponent = `(?:[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])` + domain = expression( + domainComponent, + optional(repeated(literal(`.`), domainComponent)), + optional(literal(`:`), `[0-9]+`)) // DomainRegexp defines the structure of potential domain components // that may be part of image names. This is purposely a subset of what is // allowed by DNS to ensure backwards compatibility with Docker image // names. - DomainRegexp = expression( - domainComponentRegexp, - optional(repeated(literal(`.`), domainComponentRegexp)), - optional(literal(`:`), match(`[0-9]+`))) + DomainRegexp = re(domain) + tag = `[\w][\w.-]{0,127}` // TagRegexp matches valid tag names. From docker/docker:graph/tags.go. - TagRegexp = match(`[\w][\w.-]{0,127}`) + TagRegexp = re(tag) + anchoredTag = anchored(tag) // anchoredTagRegexp matches valid tag names, anchored at the start and // end of the matched string. - anchoredTagRegexp = anchored(TagRegexp) + anchoredTagRegexp = re(anchoredTag) + digestPat = `[A-Za-z][A-Za-z0-9]*(?:[-_+.][A-Za-z][A-Za-z0-9]*)*[:][[:xdigit:]]{32,}` // DigestRegexp matches valid digests. - DigestRegexp = match(`[A-Za-z][A-Za-z0-9]*(?:[-_+.][A-Za-z][A-Za-z0-9]*)*[:][[:xdigit:]]{32,}`) + DigestRegexp = re(digestPat) + anchoredDigest = anchored(digestPat) // anchoredDigestRegexp matches valid digests, anchored at the start and // end of the matched string. - anchoredDigestRegexp = anchored(DigestRegexp) + anchoredDigestRegexp = re(anchoredDigest) + namePat = expression( + optional(domain, literal(`/`)), + nameComponent, + optional(repeated(literal(`/`), nameComponent))) // NameRegexp is the format for the name component of references. The // regexp has capturing groups for the domain and name part omitting // the separating forward slash from either. - NameRegexp = expression( - optional(DomainRegexp, literal(`/`)), - nameComponentRegexp, - optional(repeated(literal(`/`), nameComponentRegexp))) + NameRegexp = re(namePat) + anchoredName = anchored( + optional(capture(domain), literal(`/`)), + capture(nameComponent, + optional(repeated(literal(`/`), nameComponent)))) // anchoredNameRegexp is used to parse a name value, capturing the // domain and trailing components. - anchoredNameRegexp = anchored( - optional(capture(DomainRegexp), literal(`/`)), - capture(nameComponentRegexp, - optional(repeated(literal(`/`), nameComponentRegexp)))) + anchoredNameRegexp = re(anchoredName) + referencePat = anchored(capture(namePat), + optional(literal(":"), capture(tag)), + optional(literal("@"), capture(digestPat))) // ReferenceRegexp is the full supported format of a reference. The regexp // is anchored and has capturing groups for name, tag, and digest // components. - ReferenceRegexp = anchored(capture(NameRegexp), - optional(literal(":"), capture(TagRegexp)), - optional(literal("@"), capture(DigestRegexp))) + ReferenceRegexp = re(referencePat) + identifier = `([a-f0-9]{64})` // IdentifierRegexp is the format for string identifier used as a // content addressable identifier using sha256. These identifiers // are like digests without the algorithm, since sha256 is used. - IdentifierRegexp = match(`([a-f0-9]{64})`) + IdentifierRegexp = re(identifier) + shortIdentifier = `([a-f0-9]{6,64})` // ShortIdentifierRegexp is the format used to represent a prefix // of an identifier. A prefix may be used to match a sha256 identifier // within a list of trusted identifiers. - ShortIdentifierRegexp = match(`([a-f0-9]{6,64})`) + ShortIdentifierRegexp = re(shortIdentifier) + anchoredIdentifier = anchored(identifier) // anchoredIdentifierRegexp is used to check or match an // identifier value, anchored at start and end of string. - anchoredIdentifierRegexp = anchored(IdentifierRegexp) + anchoredIdentifierRegexp = re(anchoredIdentifier) + anchoredShortIdentifier = anchored(shortIdentifier) // anchoredShortIdentifierRegexp is used to check if a value // is a possible identifier prefix, anchored at start and end // of string. - anchoredShortIdentifierRegexp = anchored(ShortIdentifierRegexp) + anchoredShortIdentifierRegexp = re(anchoredShortIdentifier) ) -// match compiles the string to a regular expression. -var match = regexp.MustCompile +// re compiles the string to a regular expression. +var re = regexp.MustCompile // literal compiles s into a literal regular expression, escaping any regexp // reserved characters. -func literal(s string) *regexp.Regexp { - re := match(regexp.QuoteMeta(s)) +func literal(s string) string { + re := re(regexp.QuoteMeta(s)) if _, complete := re.LiteralPrefix(); !complete { panic("must be a literal") } - return re + return re.String() } // expression defines a full expression, where each regular expression must // follow the previous. -func expression(res ...*regexp.Regexp) *regexp.Regexp { +func expression(res ...string) string { var s string for _, re := range res { - s += re.String() + s += re } - return match(s) + return s } // optional wraps the expression in a non-capturing group and makes the // production optional. -func optional(res ...*regexp.Regexp) *regexp.Regexp { - return match(group(expression(res...)).String() + `?`) +func optional(res ...string) string { + return group(expression(res...)) + `?` } // repeated wraps the regexp in a non-capturing group to get one or more // matches. -func repeated(res ...*regexp.Regexp) *regexp.Regexp { - return match(group(expression(res...)).String() + `+`) +func repeated(res ...string) string { + return group(expression(res...)) + `+` } // group wraps the regexp in a non-capturing group. -func group(res ...*regexp.Regexp) *regexp.Regexp { - return match(`(?:` + expression(res...).String() + `)`) +func group(res ...string) string { + return `(?:` + expression(res...) + `)` } // capture wraps the expression in a capturing group. -func capture(res ...*regexp.Regexp) *regexp.Regexp { - return match(`(` + expression(res...).String() + `)`) +func capture(res ...string) string { + return `(` + expression(res...) + `)` } // anchored anchors the regular expression by adding start and end delimiters. -func anchored(res ...*regexp.Regexp) *regexp.Regexp { - return match(`^` + expression(res...).String() + `$`) +func anchored(res ...string) string { + return `^` + expression(res...) + `$` }