distribution/registry/storage/driver/walk.go
James Hewitt e22f7cbc73
Pass the last paging flag to storage drivers
Storage drivers may be able to take advantage of the hint to start
their walk more efficiently.

For S3: The API takes a start-after parameter. Registries with many
repositories can drastically reduce calls to s3 by telling s3 to only
list results lexographically after the last parameter.

For the fallback: We can start deeper in the tree and avoid statting
the files and directories before the hint in a walk. For a filesystem
this improves performance a little, but many of the API based drivers
are currently treated like a filesystem, so this drastically improves
the performance of GCP and Azure blob.

Signed-off-by: James Hewitt <james.hewitt@uk.ibm.com>
2023-08-29 11:27:42 +01:00

125 lines
4.2 KiB
Go

package driver
import (
"context"
"errors"
"path/filepath"
"sort"
"strings"
"github.com/sirupsen/logrus"
)
// ErrSkipDir is used as a return value from onFileFunc to indicate that
// the directory named in the call is to be skipped. It is not returned
// as an error by any function.
var ErrSkipDir = errors.New("skip this directory")
// ErrFilledBuffer is used as a return value from onFileFunc to indicate
// that the requested number of entries has been reached and the walk can
// stop.
var ErrFilledBuffer = errors.New("we have enough entries")
// WalkFn is called once per file by Walk
type WalkFn func(fileInfo FileInfo) error
// WalkFallback traverses a filesystem defined within driver, starting
// from the given path, calling f on each file. It uses the List method and Stat to drive itself.
// If the returned error from the WalkFn is ErrSkipDir the directory will not be entered and Walk
// will continue the traversal. If the returned error from the WalkFn is ErrFilledBuffer, the walk
// stops.
func WalkFallback(ctx context.Context, driver StorageDriver, from string, f WalkFn, options ...func(*WalkOptions)) error {
walkOptions := &WalkOptions{}
for _, o := range options {
o(walkOptions)
}
startAfterHint := walkOptions.StartAfterHint
// Ensure that we are checking the hint is contained within from by adding a "/".
// Add to both in case the hint and form are the same, which would still count.
rel, err := filepath.Rel(from, startAfterHint)
if err != nil || strings.HasPrefix(rel, "..") {
// The startAfterHint is outside from, so check if we even need to walk anything
// Replace any path separators with \x00 so that the sort works in a depth-first way
if strings.ReplaceAll(startAfterHint, "/", "\x00") < strings.ReplaceAll(from, "/", "\x00") {
_, err := doWalkFallback(ctx, driver, from, "", f)
return err
}
} else {
// The startAfterHint is within from.
// Walk up the tree until we hit from - we know it is contained.
// Ensure startAfterHint is never deeper than a child of the base
// directory so that doWalkFallback doesn't have to worry about
// depth-first comparisons
base := startAfterHint
for strings.HasPrefix(base, from) {
_, err = doWalkFallback(ctx, driver, base, startAfterHint, f)
switch err.(type) {
case nil:
// No error
case PathNotFoundError:
// dir doesn't exist, so nothing to walk
default:
return err
}
if base == from {
break
}
startAfterHint = base
base, _ = filepath.Split(startAfterHint)
if len(base) > 1 {
base = strings.TrimSuffix(base, "/")
}
}
}
return nil
}
// doWalkFallback performs a depth first walk using recursion.
// from is the directory that this iteration of the function should walk.
// startAfterHint is the child within from to start the walk after. It should only ever be a child of from, or the empty string.
func doWalkFallback(ctx context.Context, driver StorageDriver, from string, startAfterHint string, f WalkFn) (bool, error) {
children, err := driver.List(ctx, from)
if err != nil {
return false, err
}
sort.Strings(children)
for _, child := range children {
// The startAfterHint has been sanitised in WalkFallback and will either be
// empty, or be suitable for an <= check for this _from_.
if child <= startAfterHint {
continue
}
// TODO(stevvooe): Calling driver.Stat for every entry is quite
// expensive when running against backends with a slow Stat
// implementation, such as GCS. This is very likely a serious
// performance bottleneck.
// Those backends should have custom walk functions. See S3.
fileInfo, err := driver.Stat(ctx, child)
if err != nil {
switch err.(type) {
case PathNotFoundError:
// repository was removed in between listing and enumeration. Ignore it.
logrus.WithField("path", child).Infof("ignoring deleted path")
continue
default:
return false, err
}
}
err = f(fileInfo)
if err == nil && fileInfo.IsDir() {
if ok, err := doWalkFallback(ctx, driver, child, startAfterHint, f); err != nil || !ok {
return ok, err
}
} else if err == ErrSkipDir {
// don't traverse into this directory
} else if err == ErrFilledBuffer {
return false, nil // no error but stop iteration
} else if err != nil {
return false, err
}
}
return true, nil
}