Delete S3 keys incrementally in batches

Instead of first collecting all keys and then deleting them in batches,
we now delete incrementally: each listed page of keys (at most the maximum
allowed batch size) is deleted as soon as it has been listed.
This avoids the frequent allocations needed to hold large S3 keyspaces
in memory, and the OOM kills that might happen as a result of them.

This commit also introduces the storagedriver.Errors type, which allows any
storage driver implementation to return multiple errors as a single error.

Signed-off-by: Milos Gajdos <milosthegajdos@gmail.com>
This commit is contained in:
Milos Gajdos 2022-04-23 12:37:09 +01:00
parent 5fe693474e
commit ebc4234fd5
No known key found for this signature in database
GPG key ID: 01300E5E6D417439
3 changed files with 71 additions and 43 deletions

View file

@ -15,6 +15,7 @@ import (
"bytes" "bytes"
"context" "context"
"crypto/tls" "crypto/tls"
"errors"
"fmt" "fmt"
"io" "io"
"io/ioutil" "io/ioutil"
@ -900,54 +901,71 @@ func (d *driver) copy(ctx context.Context, sourcePath string, destPath string) e
return err return err
} }
// min reports the smaller of the two integers a and b.
// When both are equal, that shared value is returned.
func min(a, b int) int {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
// Delete recursively deletes all objects stored at "path" and its subpaths. // Delete recursively deletes all objects stored at "path" and its subpaths.
// We must be careful since S3 does not guarantee read after delete consistency // We must be careful since S3 does not guarantee read after delete consistency
func (d *driver) Delete(ctx context.Context, path string) error { func (d *driver) Delete(ctx context.Context, path string) error {
s3Objects := make([]*s3.ObjectIdentifier, 0, listMax) s3Objects := make([]*s3.ObjectIdentifier, 0, listMax)
s3Path := d.s3Path(path)
// manually add the given path if it's a file
stat, err := d.Stat(ctx, path)
if err != nil {
return err
}
if stat != nil && !stat.IsDir() {
path := d.s3Path(path)
s3Objects = append(s3Objects, &s3.ObjectIdentifier{
Key: &path,
})
}
// list objects under the given path as a subpath (suffix with slash "/")
s3Path := d.s3Path(path) + "/"
listObjectsInput := &s3.ListObjectsV2Input{ listObjectsInput := &s3.ListObjectsV2Input{
Bucket: aws.String(d.Bucket), Bucket: aws.String(d.Bucket),
Prefix: aws.String(s3Path), Prefix: aws.String(s3Path),
} }
ListLoop:
for { for {
// list all the objects // list all the objects
resp, err := d.S3.ListObjectsV2(listObjectsInput) resp, err := d.S3.ListObjectsV2(listObjectsInput)
// resp.Contents can only be empty on the first call // resp.Contents can only be empty on the first call
// if there were no more results to return after the first call, resp.IsTruncated would have been false // if there were no more results to return after the first call, resp.IsTruncated would have been false
// and the loop would be exited without recalling ListObjects // and the loop would exit without recalling ListObjects
if err != nil || len(resp.Contents) == 0 { if err != nil || len(resp.Contents) == 0 {
break ListLoop return storagedriver.PathNotFoundError{Path: path}
} }
for _, key := range resp.Contents { for _, key := range resp.Contents {
// Skip if we encounter a key that is not a subpath (so that deleting "/a" does not delete "/ab").
if len(*key.Key) > len(s3Path) && (*key.Key)[len(s3Path)] != '/' {
continue
}
s3Objects = append(s3Objects, &s3.ObjectIdentifier{ s3Objects = append(s3Objects, &s3.ObjectIdentifier{
Key: key.Key, Key: key.Key,
}) })
} }
// Delete objects only if the list is not empty, otherwise S3 API returns a cryptic error
if len(s3Objects) > 0 {
// NOTE: according to AWS docs https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html
// by default the response returns up to 1,000 key names. The response _might_ contain fewer keys but it will never contain more.
// 1,000 keys is coincidentally (?) also the max number of keys that can be deleted in a single Delete operation, so we'll just
// call DeleteObjects here straight away and reset the object slice when successful.
resp, err := d.S3.DeleteObjects(&s3.DeleteObjectsInput{
Bucket: aws.String(d.Bucket),
Delete: &s3.Delete{
Objects: s3Objects,
Quiet: aws.Bool(false),
},
})
if err != nil {
return err
}
if len(resp.Errors) > 0 {
// NOTE: the AWS SDK's s3.Error does not implement the error interface, which
// is pretty intensely sad, so we have to make do with this conversion for now.
errs := make([]error, 0, len(resp.Errors))
for _, err := range resp.Errors {
errs = append(errs, errors.New(err.String()))
}
return storagedriver.Errors{
DriverName: driverName,
Errs: errs,
}
}
}
// NOTE: we don't want to reallocate
// the slice so we simply "reset" it
s3Objects = s3Objects[:0]
// resp.Contents must have at least one element or we would have returned not found // resp.Contents must have at least one element or we would have returned not found
listObjectsInput.StartAfter = resp.Contents[len(resp.Contents)-1].Key listObjectsInput.StartAfter = resp.Contents[len(resp.Contents)-1].Key
@ -958,24 +976,6 @@ ListLoop:
} }
} }
total := len(s3Objects)
if total == 0 {
return storagedriver.PathNotFoundError{Path: path}
}
// need to chunk objects into groups of 1000 per s3 restrictions
for i := 0; i < total; i += 1000 {
_, err := d.S3.DeleteObjects(&s3.DeleteObjectsInput{
Bucket: aws.String(d.Bucket),
Delete: &s3.Delete{
Objects: s3Objects[i:min(i+1000, total)],
Quiet: aws.Bool(false),
},
})
if err != nil {
return err
}
}
return nil return nil
} }

View file

@ -782,6 +782,10 @@ func TestMoveWithMultipartCopy(t *testing.T) {
} }
func TestListObjectsV2(t *testing.T) { func TestListObjectsV2(t *testing.T) {
if skipS3() != "" {
t.Skip(skipS3())
}
rootDir, err := ioutil.TempDir("", "driver-") rootDir, err := ioutil.TempDir("", "driver-")
if err != nil { if err != nil {
t.Fatalf("unexpected error creating temporary directory: %v", err) t.Fatalf("unexpected error creating temporary directory: %v", err)

View file

@ -169,3 +169,27 @@ type Error struct {
func (err Error) Error() string { func (err Error) Error() string {
return fmt.Sprintf("%s: %s", err.DriverName, err.Enclosed) return fmt.Sprintf("%s: %s", err.DriverName, err.Enclosed)
} }
// Errors provides the envelope for multiple errors
// for use within the storagedriver implementations.
//
// DriverName names the driver that produced the errors; Errs holds the
// individual errors in the order they were collected.
type Errors struct {
	DriverName string
	Errs       []error
}

// Compile-time check that Errors satisfies the error interface.
var _ error = Errors{}

// Error renders all collected errors as one message prefixed with the driver
// name, using the same "<driver>: <message>" layout as Error.Error.
//
// The message part is "<nil>" when Errs is empty, the sole error's message
// when Errs has one element, and otherwise "errors:\n" followed by every
// error's message, each terminated by a newline. A nil entry in Errs is
// rendered as "<nil>" instead of causing a nil-pointer panic.
func (e Errors) Error() string {
	// text guards against nil entries, on which calling Error() would panic.
	text := func(err error) string {
		if err == nil {
			return "<nil>"
		}
		return err.Error()
	}

	switch len(e.Errs) {
	case 0:
		return e.DriverName + ": <nil>"
	case 1:
		return e.DriverName + ": " + text(e.Errs[0])
	default:
		// Append into a single buffer rather than concatenating strings in
		// the loop, which would copy the whole message on every iteration.
		buf := []byte(e.DriverName + ": errors:\n")
		for _, err := range e.Errs {
			buf = append(buf, text(err)...)
			buf = append(buf, '\n')
		}
		return string(buf)
	}
}