From 1c89ce5fc1cec85e0726f598bccc12b873d04de3 Mon Sep 17 00:00:00 2001
From: Paul Cacheux <paul.cacheux@datadoghq.com>
Date: Thu, 13 Jan 2022 16:49:03 +0100
Subject: [PATCH] Improve how reference regexps are built

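The previous implementation performed many string -> regexp -> string
conversions: every helper compiled its input into a *regexp.Regexp, and
its caller immediately converted the result back with String(). Compose
the patterns as plain strings instead, and compile each final pattern
only once.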

Signed-off-by: Paul Cacheux <paul.cacheux@datadoghq.com>
---
 reference/regexp.go | 114 ++++++++++++++++++++++++--------------------
 1 file changed, 63 insertions(+), 51 deletions(-)

diff --git a/reference/regexp.go b/reference/regexp.go
index 78e2f9170..b738cdf84 100644
--- a/reference/regexp.go
+++ b/reference/regexp.go
@@ -3,145 +3,157 @@ package reference
 import "regexp"
 
 var (
-	// alphaNumericRegexp defines the alpha numeric atom, typically a
+	// alphaNumeric defines the alpha numeric atom, typically a
 	// component of names. This only allows lower case characters and digits.
-	alphaNumericRegexp = match(`[a-z0-9]+`)
+	alphaNumeric = `[a-z0-9]+`
 
-	// separatorRegexp defines the separators allowed to be embedded in name
+	// separator defines the separators allowed to be embedded in name
 	// components. This allow one period, one or two underscore and multiple
 	// dashes. Repeated dashes and underscores are intentionally treated
 	// differently. In order to support valid hostnames as name components,
 	// supporting repeated dash was added. Additionally double underscore is
 	// now allowed as a separator to loosen the restriction for previously
 	// supported names.
-	separatorRegexp = match(`(?:[._]|__|[-]*)`)
+	separator = `(?:[._]|__|[-]*)`
 
-	// nameComponentRegexp restricts registry path component names to start
+	// nameComponent restricts registry path component names to start
 	// with at least one letter or number, with following parts able to be
 	// separated by one period, one or two underscore and multiple dashes.
-	nameComponentRegexp = expression(
-		alphaNumericRegexp,
-		optional(repeated(separatorRegexp, alphaNumericRegexp)))
+	nameComponent = expression(
+		alphaNumeric,
+		optional(repeated(separator, alphaNumeric)))
 
-	// domainComponentRegexp restricts the registry domain component of a
+	// domainComponent restricts the registry domain component of a
 	// repository name to start with a component as defined by DomainRegexp
 	// and followed by an optional port.
-	domainComponentRegexp = match(`(?:[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])`)
+	domainComponent = `(?:[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])`
 
+	domain = expression(
+		domainComponent,
+		optional(repeated(literal(`.`), domainComponent)),
+		optional(literal(`:`), `[0-9]+`))
 	// DomainRegexp defines the structure of potential domain components
 	// that may be part of image names. This is purposely a subset of what is
 	// allowed by DNS to ensure backwards compatibility with Docker image
 	// names.
-	DomainRegexp = expression(
-		domainComponentRegexp,
-		optional(repeated(literal(`.`), domainComponentRegexp)),
-		optional(literal(`:`), match(`[0-9]+`)))
+	DomainRegexp = re(domain)
 
+	tag = `[\w][\w.-]{0,127}`
 	// TagRegexp matches valid tag names. From docker/docker:graph/tags.go.
-	TagRegexp = match(`[\w][\w.-]{0,127}`)
+	TagRegexp = re(tag)
 
+	anchoredTag = anchored(tag)
 	// anchoredTagRegexp matches valid tag names, anchored at the start and
 	// end of the matched string.
-	anchoredTagRegexp = anchored(TagRegexp)
+	anchoredTagRegexp = re(anchoredTag)
 
+	digestPat = `[A-Za-z][A-Za-z0-9]*(?:[-_+.][A-Za-z][A-Za-z0-9]*)*[:][[:xdigit:]]{32,}`
 	// DigestRegexp matches valid digests.
-	DigestRegexp = match(`[A-Za-z][A-Za-z0-9]*(?:[-_+.][A-Za-z][A-Za-z0-9]*)*[:][[:xdigit:]]{32,}`)
+	DigestRegexp = re(digestPat)
 
+	anchoredDigest = anchored(digestPat)
 	// anchoredDigestRegexp matches valid digests, anchored at the start and
 	// end of the matched string.
-	anchoredDigestRegexp = anchored(DigestRegexp)
+	anchoredDigestRegexp = re(anchoredDigest)
 
+	namePat = expression(
+		optional(domain, literal(`/`)),
+		nameComponent,
+		optional(repeated(literal(`/`), nameComponent)))
 	// NameRegexp is the format for the name component of references. The
 	// regexp has capturing groups for the domain and name part omitting
 	// the separating forward slash from either.
-	NameRegexp = expression(
-		optional(DomainRegexp, literal(`/`)),
-		nameComponentRegexp,
-		optional(repeated(literal(`/`), nameComponentRegexp)))
+	NameRegexp = re(namePat)
 
+	anchoredName = anchored(
+		optional(capture(domain), literal(`/`)),
+		capture(nameComponent,
+			optional(repeated(literal(`/`), nameComponent))))
 	// anchoredNameRegexp is used to parse a name value, capturing the
 	// domain and trailing components.
-	anchoredNameRegexp = anchored(
-		optional(capture(DomainRegexp), literal(`/`)),
-		capture(nameComponentRegexp,
-			optional(repeated(literal(`/`), nameComponentRegexp))))
+	anchoredNameRegexp = re(anchoredName)
 
+	referencePat = anchored(capture(namePat),
+		optional(literal(":"), capture(tag)),
+		optional(literal("@"), capture(digestPat)))
 	// ReferenceRegexp is the full supported format of a reference. The regexp
 	// is anchored and has capturing groups for name, tag, and digest
 	// components.
-	ReferenceRegexp = anchored(capture(NameRegexp),
-		optional(literal(":"), capture(TagRegexp)),
-		optional(literal("@"), capture(DigestRegexp)))
+	ReferenceRegexp = re(referencePat)
 
+	identifier = `([a-f0-9]{64})`
 	// IdentifierRegexp is the format for string identifier used as a
 	// content addressable identifier using sha256. These identifiers
 	// are like digests without the algorithm, since sha256 is used.
-	IdentifierRegexp = match(`([a-f0-9]{64})`)
+	IdentifierRegexp = re(identifier)
 
+	shortIdentifier = `([a-f0-9]{6,64})`
 	// ShortIdentifierRegexp is the format used to represent a prefix
 	// of an identifier. A prefix may be used to match a sha256 identifier
 	// within a list of trusted identifiers.
-	ShortIdentifierRegexp = match(`([a-f0-9]{6,64})`)
+	ShortIdentifierRegexp = re(shortIdentifier)
 
+	anchoredIdentifier = anchored(identifier)
 	// anchoredIdentifierRegexp is used to check or match an
 	// identifier value, anchored at start and end of string.
-	anchoredIdentifierRegexp = anchored(IdentifierRegexp)
+	anchoredIdentifierRegexp = re(anchoredIdentifier)
 
+	anchoredShortIdentifier = anchored(shortIdentifier)
 	// anchoredShortIdentifierRegexp is used to check if a value
 	// is a possible identifier prefix, anchored at start and end
 	// of string.
-	anchoredShortIdentifierRegexp = anchored(ShortIdentifierRegexp)
+	anchoredShortIdentifierRegexp = re(anchoredShortIdentifier)
 )
 
-// match compiles the string to a regular expression.
-var match = regexp.MustCompile
+// re compiles the string to a regular expression.
+var re = regexp.MustCompile
 
 // literal compiles s into a literal regular expression, escaping any regexp
 // reserved characters.
-func literal(s string) *regexp.Regexp {
-	re := match(regexp.QuoteMeta(s))
+func literal(s string) string {
+	re := re(regexp.QuoteMeta(s))
 
 	if _, complete := re.LiteralPrefix(); !complete {
 		panic("must be a literal")
 	}
 
-	return re
+	return re.String()
 }
 
 // expression defines a full expression, where each regular expression must
 // follow the previous.
-func expression(res ...*regexp.Regexp) *regexp.Regexp {
+func expression(res ...string) string {
 	var s string
 	for _, re := range res {
-		s += re.String()
+		s += re
 	}
 
-	return match(s)
+	return s
 }
 
 // optional wraps the expression in a non-capturing group and makes the
 // production optional.
-func optional(res ...*regexp.Regexp) *regexp.Regexp {
-	return match(group(expression(res...)).String() + `?`)
+func optional(res ...string) string {
+	return group(expression(res...)) + `?`
 }
 
 // repeated wraps the regexp in a non-capturing group to get one or more
 // matches.
-func repeated(res ...*regexp.Regexp) *regexp.Regexp {
-	return match(group(expression(res...)).String() + `+`)
+func repeated(res ...string) string {
+	return group(expression(res...)) + `+`
 }
 
 // group wraps the regexp in a non-capturing group.
-func group(res ...*regexp.Regexp) *regexp.Regexp {
-	return match(`(?:` + expression(res...).String() + `)`)
+func group(res ...string) string {
+	return `(?:` + expression(res...) + `)`
 }
 
 // capture wraps the expression in a capturing group.
-func capture(res ...*regexp.Regexp) *regexp.Regexp {
-	return match(`(` + expression(res...).String() + `)`)
+func capture(res ...string) string {
+	return `(` + expression(res...) + `)`
 }
 
 // anchored anchors the regular expression by adding start and end delimiters.
-func anchored(res ...*regexp.Regexp) *regexp.Regexp {
-	return match(`^` + expression(res...).String() + `$`)
+func anchored(res ...string) string {
+	return `^` + expression(res...) + `$`
 }