Merge pull request #3789 from thaJeztah/reference_consts

reference: clean up regular expressions
This commit is contained in:
Milos Gajdos 2022-11-25 12:38:43 +00:00 committed by GitHub
commit ede90db01c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 176 additions and 150 deletions

View file

@ -32,7 +32,7 @@ func FamiliarString(ref Reference) string {
} }
// FamiliarMatch reports whether ref matches the specified pattern. // FamiliarMatch reports whether ref matches the specified pattern.
// See https://godoc.org/path#Match for supported patterns. // See [path.Match] for supported patterns.
func FamiliarMatch(pattern string, ref Reference) (bool, error) { func FamiliarMatch(pattern string, ref Reference) (bool, error) {
matched, err := path.Match(pattern, FamiliarString(ref)) matched, err := path.Match(pattern, FamiliarString(ref))
if namedRef, isNamed := ref.(Named); isNamed && !matched { if namedRef, isNamed := ref.(Named); isNamed && !matched {

View file

@ -8,10 +8,35 @@ import (
) )
const ( const (
// legacyDefaultDomain is the legacy domain for Docker Hub (which was
// originally named "the Docker Index"). This domain is still used for
// authentication and image search, which were part of the "v1" Docker
// registry specification.
//
// This domain will continue to be supported, but there are plans to consolidate
// legacy domains to new "canonical" domains. Once those domains are decided
// on, we must update the normalization functions, but preserve compatibility
// with existing installs, clients, and user configuration.
legacyDefaultDomain = "index.docker.io" legacyDefaultDomain = "index.docker.io"
defaultDomain = "docker.io"
officialRepoPrefix = "library/" // defaultDomain is the default domain used for images on Docker Hub.
defaultTag = "latest" // It is used to normalize "familiar" names to canonical names, for example,
// to convert "ubuntu" to "docker.io/library/ubuntu:latest".
//
// Note that the actual domain of Docker Hub's registry is registry-1.docker.io.
// This domain will continue to be supported, but there are plans to consolidate
// legacy domains to new "canonical" domains. Once those domains are decided
// on, we must update the normalization functions, but preserve compatibility
// with existing installs, clients, and user configuration.
defaultDomain = "docker.io"
// officialRepoPrefix is the namespace used for official images on Docker Hub.
// It is used to normalize "familiar" names to canonical names, for example,
// to convert "ubuntu" to "docker.io/library/ubuntu:latest".
officialRepoPrefix = "library/"
// defaultTag is the default tag if no tag is provided.
defaultTag = "latest"
) )
// normalizedNamed represents a name which has been // normalizedNamed represents a name which has been
@ -33,14 +58,14 @@ func ParseNormalizedNamed(s string) (Named, error) {
return nil, fmt.Errorf("invalid repository name (%s), cannot specify 64-byte hexadecimal strings", s) return nil, fmt.Errorf("invalid repository name (%s), cannot specify 64-byte hexadecimal strings", s)
} }
domain, remainder := splitDockerDomain(s) domain, remainder := splitDockerDomain(s)
var remoteName string var remote string
if tagSep := strings.IndexRune(remainder, ':'); tagSep > -1 { if tagSep := strings.IndexRune(remainder, ':'); tagSep > -1 {
remoteName = remainder[:tagSep] remote = remainder[:tagSep]
} else { } else {
remoteName = remainder remote = remainder
} }
if strings.ToLower(remoteName) != remoteName { if strings.ToLower(remote) != remote {
return nil, fmt.Errorf("invalid reference format: repository name (%s) must be lowercase", remoteName) return nil, fmt.Errorf("invalid reference format: repository name (%s) must be lowercase", remote)
} }
ref, err := Parse(domain + "/" + remainder) ref, err := Parse(domain + "/" + remainder)
@ -54,41 +79,53 @@ func ParseNormalizedNamed(s string) (Named, error) {
return named, nil return named, nil
} }
// ParseDockerRef normalizes the image reference following the docker convention. This is added // namedTaggedDigested is a reference that has both a tag and a digest.
// mainly for backward compatibility. type namedTaggedDigested interface {
// The reference returned can only be either tagged or digested. For reference contains both tag NamedTagged
// and digest, the function returns digested reference, e.g. docker.io/library/busybox:latest@ Digested
// sha256:7cc4b5aefd1d0cadf8d97d4350462ba51c694ebca145b08d7d41b41acc8db5aa will be returned as }
// docker.io/library/busybox@sha256:7cc4b5aefd1d0cadf8d97d4350462ba51c694ebca145b08d7d41b41acc8db5aa.
// ParseDockerRef normalizes the image reference following the docker convention,
// which allows for references to contain both a tag and a digest. It returns a
// reference that is either tagged or digested. For references containing both
// a tag and a digest, it returns a digested reference. For example, the following
// reference:
//
// docker.io/library/busybox:latest@sha256:7cc4b5aefd1d0cadf8d97d4350462ba51c694ebca145b08d7d41b41acc8db5aa
//
// Is returned as a digested reference (with the ":latest" tag removed):
//
// docker.io/library/busybox@sha256:7cc4b5aefd1d0cadf8d97d4350462ba51c694ebca145b08d7d41b41acc8db5aa
//
// References that are already "tagged" or "digested" are returned unmodified:
//
// // Already a digested reference
// docker.io/library/busybox@sha256:7cc4b5aefd1d0cadf8d97d4350462ba51c694ebca145b08d7d41b41acc8db5aa
//
// // Already a tagged reference
// docker.io/library/busybox:latest
func ParseDockerRef(ref string) (Named, error) { func ParseDockerRef(ref string) (Named, error) {
named, err := ParseNormalizedNamed(ref) named, err := ParseNormalizedNamed(ref)
if err != nil { if err != nil {
return nil, err return nil, err
} }
if _, ok := named.(NamedTagged); ok { if canonical, ok := named.(namedTaggedDigested); ok {
if canonical, ok := named.(Canonical); ok { // The reference is both tagged and digested; only return digested.
// The reference is both tagged and digested, only newNamed, err := WithName(canonical.Name())
// return digested. if err != nil {
newNamed, err := WithName(canonical.Name()) return nil, err
if err != nil {
return nil, err
}
newCanonical, err := WithDigest(newNamed, canonical.Digest())
if err != nil {
return nil, err
}
return newCanonical, nil
} }
return WithDigest(newNamed, canonical.Digest())
} }
return TagNameOnly(named), nil return TagNameOnly(named), nil
} }
// splitDockerDomain splits a repository name to domain and remotename string. // splitDockerDomain splits a repository name to domain and remote-name.
// If no valid domain is found, the default domain is used. Repository name // If no valid domain is found, the default domain is used. Repository name
// needs to be already validated before. // needs to be already validated before.
func splitDockerDomain(name string) (domain, remainder string) { func splitDockerDomain(name string) (domain, remainder string) {
i := strings.IndexRune(name, '/') i := strings.IndexRune(name, '/')
if i == -1 || (!strings.ContainsAny(name[:i], ".:") && name[:i] != "localhost" && strings.ToLower(name[:i]) == name[:i]) { if i == -1 || (!strings.ContainsAny(name[:i], ".:") && name[:i] != localhost && strings.ToLower(name[:i]) == name[:i]) {
domain, remainder = defaultDomain, name domain, remainder = defaultDomain, name
} else { } else {
domain, remainder = name[:i], name[i+1:] domain, remainder = name[:i], name[i+1:]
@ -96,11 +133,6 @@ func splitDockerDomain(name string) (domain, remainder string) {
if domain == legacyDefaultDomain { if domain == legacyDefaultDomain {
domain = defaultDomain domain = defaultDomain
} }
// TODO(thaJeztah): this check may be too strict, as it assumes the
// "library/" namespace does not have nested namespaces. While this
// is true (currently), technically it would be possible for Docker
// Hub to use those (e.g. "library/distros/ubuntu:latest").
// See https://github.com/distribution/distribution/pull/3769#issuecomment-1302031785.
if domain == defaultDomain && !strings.ContainsRune(remainder, '/') { if domain == defaultDomain && !strings.ContainsRune(remainder, '/') {
remainder = officialRepoPrefix + remainder remainder = officialRepoPrefix + remainder
} }

View file

@ -4,13 +4,14 @@
// Grammar // Grammar
// //
// reference := name [ ":" tag ] [ "@" digest ] // reference := name [ ":" tag ] [ "@" digest ]
// name := [domain '/'] path-component ['/' path-component]* // name := [domain '/'] remote-name
// domain := host [':' port-number] // domain := host [':' port-number]
// host := domain-name | IPv4address | \[ IPv6address \] ; rfc3986 appendix-A // host := domain-name | IPv4address | \[ IPv6address \] ; rfc3986 appendix-A
// domain-name := domain-component ['.' domain-component]* // domain-name := domain-component ['.' domain-component]*
// domain-component := /([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])/ // domain-component := /([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])/
// port-number := /[0-9]+/ // port-number := /[0-9]+/
// path-component := alpha-numeric [separator alpha-numeric]* // path-component := alpha-numeric [separator alpha-numeric]*
// path (or "remote-name") := path-component ['/' path-component]*
// alpha-numeric := /[a-z0-9]+/ // alpha-numeric := /[a-z0-9]+/
// separator := /[_.]|__|[-]*/ // separator := /[_.]|__|[-]*/
// //
@ -23,7 +24,6 @@
// digest-hex := /[0-9a-fA-F]{32,}/ ; At least 128 bit digest value // digest-hex := /[0-9a-fA-F]{32,}/ ; At least 128 bit digest value
// //
// identifier := /[a-f0-9]{64}/ // identifier := /[a-f0-9]{64}/
// short-identifier := /[a-f0-9]{6,64}/
package reference package reference
import ( import (
@ -147,7 +147,7 @@ type namedRepository interface {
Path() string Path() string
} }
// Domain returns the domain part of the Named reference // Domain returns the domain part of the [Named] reference.
func Domain(named Named) string { func Domain(named Named) string {
if r, ok := named.(namedRepository); ok { if r, ok := named.(namedRepository); ok {
return r.Domain() return r.Domain()
@ -156,7 +156,7 @@ func Domain(named Named) string {
return domain return domain
} }
// Path returns the name without the domain part of the Named reference // Path returns the name without the domain part of the [Named] reference.
func Path(named Named) (name string) { func Path(named Named) (name string) {
if r, ok := named.(namedRepository); ok { if r, ok := named.(namedRepository); ok {
return r.Path() return r.Path()
@ -188,7 +188,6 @@ func SplitHostname(named Named) (string, string) {
// Parse parses s and returns a syntactically valid Reference. // Parse parses s and returns a syntactically valid Reference.
// If an error was encountered it is returned, along with a nil Reference. // If an error was encountered it is returned, along with a nil Reference.
// NOTE: Parse will not handle short digests.
func Parse(s string) (Reference, error) { func Parse(s string) (Reference, error) {
matches := ReferenceRegexp.FindStringSubmatch(s) matches := ReferenceRegexp.FindStringSubmatch(s)
if matches == nil { if matches == nil {
@ -240,7 +239,6 @@ func Parse(s string) (Reference, error) {
// the Named interface. The reference must have a name and be in the canonical // the Named interface. The reference must have a name and be in the canonical
// form, otherwise an error is returned. // form, otherwise an error is returned.
// If an error was encountered it is returned, along with a nil Reference. // If an error was encountered it is returned, along with a nil Reference.
// NOTE: ParseNamed will not handle short digests.
func ParseNamed(s string) (Named, error) { func ParseNamed(s string) (Named, error) {
named, err := ParseNormalizedNamed(s) named, err := ParseNormalizedNamed(s)
if err != nil { if err != nil {

View file

@ -1,14 +1,50 @@
package reference package reference
import "regexp" import (
"regexp"
"strings"
)
var ( // DigestRegexp matches well-formed digests, including algorithm (e.g. "sha256:<encoded>").
// alphaNumeric defines the alpha numeric atom, typically a var DigestRegexp = regexp.MustCompile(digestPat)
// DomainRegexp matches hostname or IP-addresses, optionally including a port
// number. It defines the structure of potential domain components that may be
// part of image names. This is purposely a subset of what is allowed by DNS to
// ensure backwards compatibility with Docker image names. It may be a subset of
// DNS domain name, an IPv4 address in decimal format, or an IPv6 address between
// square brackets (excluding zone identifiers as defined by [RFC 6874] or special
// addresses such as IPv4-Mapped).
//
// [RFC 6874]: https://www.rfc-editor.org/rfc/rfc6874
var DomainRegexp = regexp.MustCompile(domainAndPort)
// IdentifierRegexp is the format for string identifier used as a
// content addressable identifier using sha256. These identifiers
// are like digests without the algorithm, since sha256 is used.
var IdentifierRegexp = regexp.MustCompile(identifier)
// NameRegexp is the format for the name component of references, including
// an optional domain and port, but without tag or digest suffix.
var NameRegexp = regexp.MustCompile(namePat)
// ReferenceRegexp is the full supported format of a reference. The regexp
// is anchored and has capturing groups for name, tag, and digest
// components.
var ReferenceRegexp = regexp.MustCompile(referencePat)
// TagRegexp matches valid tag names. From [docker/docker:graph/tags.go].
//
// [docker/docker:graph/tags.go]: https://github.com/moby/moby/blob/v1.6.0/graph/tags.go#L26-L28
var TagRegexp = regexp.MustCompile(tag)
const (
// alphanumeric defines the alphanumeric atom, typically a
// component of names. This only allows lower case characters and digits. // component of names. This only allows lower case characters and digits.
alphaNumeric = `[a-z0-9]+` alphanumeric = `[a-z0-9]+`
// separator defines the separators allowed to be embedded in name // separator defines the separators allowed to be embedded in name
// components. This allow one period, one or two underscore and multiple // components. This allows one period, one or two underscore and multiple
// dashes. Repeated dashes and underscores are intentionally treated // dashes. Repeated dashes and underscores are intentionally treated
// differently. In order to support valid hostnames as name components, // differently. In order to support valid hostnames as name components,
// supporting repeated dash was added. Additionally double underscore is // supporting repeated dash was added. Additionally double underscore is
@ -16,33 +52,51 @@ var (
// supported names. // supported names.
separator = `(?:[._]|__|[-]*)` separator = `(?:[._]|__|[-]*)`
// nameComponent restricts registry path component names to start // localhost is treated as a special value for domain-name. Any other
// with at least one letter or number, with following parts able to be // domain-name without a "." or a ":port" is considered a path component.
// separated by one period, one or two underscore and multiple dashes. localhost = `localhost`
nameComponent = expression(
alphaNumeric,
optional(repeated(separator, alphaNumeric)))
// domainNameComponent restricts the registry domain component of a // domainNameComponent restricts the registry domain component of a
// repository name to start with a component as defined by DomainRegexp. // repository name to start with a component as defined by DomainRegexp.
domainNameComponent = `(?:[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])` domainNameComponent = `(?:[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])`
// optionalPort matches an optional port-number including the port separator
// (e.g. ":80").
optionalPort = `(?::[0-9]+)?`
// tag matches valid tag names. From docker/docker:graph/tags.go.
tag = `[\w][\w.-]{0,127}`
// digestPat matches well-formed digests, including algorithm (e.g. "sha256:<encoded>").
//
// TODO(thaJeztah): this should follow the same rules as https://pkg.go.dev/github.com/opencontainers/go-digest@v1.0.0#DigestRegexp
// so that go-digest defines the canonical format. Note that the go-digest is
// more relaxed:
// - it allows multiple algorithms (e.g. "sha256+b64:<encoded>") to allow
// future expansion of supported algorithms.
// - it allows the "<encoded>" value to use urlsafe base64 encoding as defined
// in [rfc4648, section 5].
//
// [rfc4648, section 5]: https://www.rfc-editor.org/rfc/rfc4648#section-5
digestPat = `[A-Za-z][A-Za-z0-9]*(?:[-_+.][A-Za-z][A-Za-z0-9]*)*[:][[:xdigit:]]{32,}`
// identifier is the format for a content addressable identifier using sha256.
// These identifiers are like digests without the algorithm, since sha256 is used.
identifier = `([a-f0-9]{64})`
// ipv6address are enclosed between square brackets and may be represented // ipv6address are enclosed between square brackets and may be represented
// in many ways, see rfc5952. Only IPv6 in compressed or uncompressed format // in many ways, see rfc5952. Only IPv6 in compressed or uncompressed format
// are allowed, IPv6 zone identifiers (rfc6874) or Special addresses such as // are allowed, IPv6 zone identifiers (rfc6874) or Special addresses such as
// IPv4-Mapped are deliberately excluded. // IPv4-Mapped are deliberately excluded.
ipv6address = expression( ipv6address = `\[(?:[a-fA-F0-9:]+)\]`
literal(`[`), `(?:[a-fA-F0-9:]+)`, literal(`]`), )
)
var (
// domainName defines the structure of potential domain components // domainName defines the structure of potential domain components
// that may be part of image names. This is purposely a subset of what is // that may be part of image names. This is purposely a subset of what is
// allowed by DNS to ensure backwards compatibility with Docker image // allowed by DNS to ensure backwards compatibility with Docker image
// names. This includes IPv4 addresses in decimal format. // names. This includes IPv4 addresses in decimal format.
domainName = expression( domainName = domainNameComponent + anyTimes(`\.`+domainNameComponent)
domainNameComponent,
optional(repeated(literal(`.`), domainNameComponent)),
)
// host defines the structure of potential domains based on the URI // host defines the structure of potential domains based on the URI
// Host subcomponent on rfc3986. It may be a subset of DNS domain name, // Host subcomponent on rfc3986. It may be a subset of DNS domain name,
@ -53,117 +107,57 @@ var (
// allowed by the URI Host subcomponent on rfc3986 to ensure backwards // allowed by the URI Host subcomponent on rfc3986 to ensure backwards
// compatibility with Docker image names. // compatibility with Docker image names.
domain = expression( domainAndPort = host + optionalPort
host,
optional(literal(`:`), `[0-9]+`))
// DomainRegexp defines the structure of potential domain components
// that may be part of image names. This is purposely a subset of what is
// allowed by DNS to ensure backwards compatibility with Docker image
// names.
DomainRegexp = regexp.MustCompile(domain)
tag = `[\w][\w.-]{0,127}`
// TagRegexp matches valid tag names. From docker/docker:graph/tags.go.
TagRegexp = regexp.MustCompile(tag)
anchoredTag = anchored(tag)
// anchoredTagRegexp matches valid tag names, anchored at the start and // anchoredTagRegexp matches valid tag names, anchored at the start and
// end of the matched string. // end of the matched string.
anchoredTagRegexp = regexp.MustCompile(anchoredTag) anchoredTagRegexp = regexp.MustCompile(anchored(tag))
digestPat = `[A-Za-z][A-Za-z0-9]*(?:[-_+.][A-Za-z][A-Za-z0-9]*)*[:][[:xdigit:]]{32,}`
// DigestRegexp matches valid digests.
DigestRegexp = regexp.MustCompile(digestPat)
anchoredDigest = anchored(digestPat)
// anchoredDigestRegexp matches valid digests, anchored at the start and // anchoredDigestRegexp matches valid digests, anchored at the start and
// end of the matched string. // end of the matched string.
anchoredDigestRegexp = regexp.MustCompile(anchoredDigest) anchoredDigestRegexp = regexp.MustCompile(anchored(digestPat))
namePat = expression( // pathComponent restricts path-components to start with an alphanumeric
optional(domain, literal(`/`)), // character, with following parts able to be separated by a separator
nameComponent, // (one period, one or two underscore and multiple dashes).
optional(repeated(literal(`/`), nameComponent))) pathComponent = alphanumeric + anyTimes(separator+alphanumeric)
// NameRegexp is the format for the name component of references. The
// regexp has capturing groups for the domain and name part omitting // remoteName matches the remote-name of a repository. It consists of one
// the separating forward slash from either. // or more forward slash (/) delimited path-components:
NameRegexp = regexp.MustCompile(namePat) //
// pathComponent[[/pathComponent] ...] // e.g., "library/ubuntu"
remoteName = pathComponent + anyTimes(`/`+pathComponent)
namePat = optional(domainAndPort+`/`) + remoteName
anchoredName = anchored(
optional(capture(domain), literal(`/`)),
capture(nameComponent,
optional(repeated(literal(`/`), nameComponent))))
// anchoredNameRegexp is used to parse a name value, capturing the // anchoredNameRegexp is used to parse a name value, capturing the
// domain and trailing components. // domain and trailing components.
anchoredNameRegexp = regexp.MustCompile(anchoredName) anchoredNameRegexp = regexp.MustCompile(anchored(optional(capture(domainAndPort), `/`), capture(remoteName)))
referencePat = anchored(capture(namePat), referencePat = anchored(capture(namePat), optional(`:`, capture(tag)), optional(`@`, capture(digestPat)))
optional(literal(":"), capture(tag)),
optional(literal("@"), capture(digestPat)))
// ReferenceRegexp is the full supported format of a reference. The regexp
// is anchored and has capturing groups for name, tag, and digest
// components.
ReferenceRegexp = regexp.MustCompile(referencePat)
identifier = `([a-f0-9]{64})`
// IdentifierRegexp is the format for string identifier used as a
// content addressable identifier using sha256. These identifiers
// are like digests without the algorithm, since sha256 is used.
IdentifierRegexp = regexp.MustCompile(identifier)
anchoredIdentifier = anchored(identifier)
// anchoredIdentifierRegexp is used to check or match an // anchoredIdentifierRegexp is used to check or match an
// identifier value, anchored at start and end of string. // identifier value, anchored at start and end of string.
anchoredIdentifierRegexp = regexp.MustCompile(anchoredIdentifier) anchoredIdentifierRegexp = regexp.MustCompile(anchored(identifier))
) )
// literal compiles s into a literal regular expression, escaping any regexp
// reserved characters.
func literal(s string) string {
re := regexp.MustCompile(regexp.QuoteMeta(s))
if _, complete := re.LiteralPrefix(); !complete {
panic("must be a literal")
}
return re.String()
}
// expression defines a full expression, where each regular expression must
// follow the previous.
func expression(res ...string) string {
var s string
for _, re := range res {
s += re
}
return s
}
// optional wraps the expression in a non-capturing group and makes the // optional wraps the expression in a non-capturing group and makes the
// production optional. // production optional.
func optional(res ...string) string { func optional(res ...string) string {
return group(expression(res...)) + `?` return `(?:` + strings.Join(res, "") + `)?`
} }
// repeated wraps the regexp in a non-capturing group to get one or more // anyTimes wraps the expression in a non-capturing group that can occur
// matches. // any number of times.
func repeated(res ...string) string { func anyTimes(res ...string) string {
return group(expression(res...)) + `+` return `(?:` + strings.Join(res, "") + `)*`
}
// group wraps the regexp in a non-capturing group.
func group(res ...string) string {
return `(?:` + expression(res...) + `)`
} }
// capture wraps the expression in a capturing group. // capture wraps the expression in a capturing group.
func capture(res ...string) string { func capture(res ...string) string {
return `(` + expression(res...) + `)` return `(` + strings.Join(res, "") + `)`
} }
// anchored anchors the regular expression by adding start and end delimiters. // anchored anchors the regular expression by adding start and end delimiters.
func anchored(res ...string) string { func anchored(res ...string) string {
return `^` + expression(res...) + `$` return `^` + strings.Join(res, "") + `$`
} }

View file

@ -20,14 +20,16 @@ import (
"sort" "sort"
) )
// Sort sorts string references preferring higher information references // Sort sorts string references preferring higher information references.
//
// The precedence is as follows: // The precedence is as follows:
// 1. Name + Tag + Digest //
// 2. Name + Tag // 1. [Named] + [Tagged] + [Digested] (e.g., "docker.io/library/busybox:latest@sha256:<digest>")
// 3. Name + Digest // 2. [Named] + [Tagged] (e.g., "docker.io/library/busybox:latest")
// 4. Name // 3. [Named] + [Digested] (e.g., "docker.io/library/busybo@sha256:<digest>")
// 5. Digest // 4. [Named] (e.g., "docker.io/library/busybox")
// 6. Parse error // 5. [Digested] (e.g., "docker.io@sha256:<digest>")
// 6. Parse error
func Sort(references []string) []string { func Sort(references []string) []string {
var prefs []Reference var prefs []Reference
var bad []string var bad []string