From ebe86c6cec4b699397dc8c9d3e2a3b28c48fa3ab Mon Sep 17 00:00:00 2001 From: Nick Craig-Wood Date: Fri, 29 Jul 2022 17:01:59 +0100 Subject: [PATCH] s3: add --s3-decompress flag to download gzip-encoded files Before this change, if an object compressed with "Content-Encoding: gzip" was downloaded, a length and hash mismatch would occur since the Go HTTP transport automatically decompressed the object on download. If --s3-decompress is set, this change erases the length and hash on compressed objects so they can be downloaded successfully, at the cost of not being able to check the length or the hash of the downloaded object. If --s3-decompress is not set, the compressed files will be downloaded as-is, providing compressed objects with intact size and hash information. See #2658 --- backend/s3/s3.go | 77 ++++++++++++++++++++++++++-------- backend/s3/s3_internal_test.go | 35 +++++++++++++++- 2 files changed, 93 insertions(+), 19 deletions(-) diff --git a/backend/s3/s3.go b/backend/s3/s3.go index 78f23b3ed..f0a02b391 100644 --- a/backend/s3/s3.go +++ b/backend/s3/s3.go @@ -2004,6 +2004,19 @@ See [the time option docs](/docs/#time-option) for valid formats. `, Default: fs.Time{}, Advanced: true, + }, { + Name: "decompress", + Help: `If set this will decompress gzip encoded objects. + +It is possible to upload objects to S3 with "Content-Encoding: gzip" +set. Normally rclone will download these files as compressed objects. + +If this flag is set then rclone will decompress these files with +"Content-Encoding: gzip" as they are received. This means that rclone +can't check the size and hash but the file contents will be decompressed. 
+`, + Advanced: true, + Default: false, }, }}) } @@ -2128,28 +2141,30 @@ type Options struct { UsePresignedRequest bool `config:"use_presigned_request"` Versions bool `config:"versions"` VersionAt fs.Time `config:"version_at"` + Decompress bool `config:"decompress"` } // Fs represents a remote s3 server type Fs struct { - name string // the name of the remote - root string // root of the bucket - ignore all objects above this - opt Options // parsed options - ci *fs.ConfigInfo // global config - ctx context.Context // global context for reading config - features *fs.Features // optional features - c *s3.S3 // the connection to the s3 server - ses *session.Session // the s3 session - rootBucket string // bucket part of root (if any) - rootDirectory string // directory part of root (if any) - cache *bucket.Cache // cache for bucket creation status - pacer *fs.Pacer // To pace the API calls - srv *http.Client // a plain http client - srvRest *rest.Client // the rest connection to the server - pool *pool.Pool // memory pool - etagIsNotMD5 bool // if set ETags are not MD5s - versioningMu sync.Mutex - versioning fs.Tristate // if set bucket is using versions + name string // the name of the remote + root string // root of the bucket - ignore all objects above this + opt Options // parsed options + ci *fs.ConfigInfo // global config + ctx context.Context // global context for reading config + features *fs.Features // optional features + c *s3.S3 // the connection to the s3 server + ses *session.Session // the s3 session + rootBucket string // bucket part of root (if any) + rootDirectory string // directory part of root (if any) + cache *bucket.Cache // cache for bucket creation status + pacer *fs.Pacer // To pace the API calls + srv *http.Client // a plain http client + srvRest *rest.Client // the rest connection to the server + pool *pool.Pool // memory pool + etagIsNotMD5 bool // if set ETags are not MD5s + versioningMu sync.Mutex + versioning fs.Tristate // if set 
bucket is using versions + warnCompressed sync.Once // warn once about compressed files } // Object describes a s3 object @@ -4318,6 +4333,10 @@ func (o *Object) Hash(ctx context.Context, t hash.Type) (string, error) { if t != hash.MD5 { return "", hash.ErrUnsupported } + // If decompressing, erase the hash + if o.bytes < 0 { + return "", nil + } // If we haven't got an MD5, then check the metadata if o.md5 == "" { err := o.readMetaData(ctx) @@ -4439,6 +4458,12 @@ func (o *Object) setMetaData(resp *s3.HeadObjectOutput) { o.contentDisposition = resp.ContentDisposition o.contentEncoding = resp.ContentEncoding o.contentLanguage = resp.ContentLanguage + + // If decompressing then size and md5sum are unknown + if o.fs.opt.Decompress && aws.StringValue(o.contentEncoding) == "gzip" { + o.bytes = -1 + o.md5 = "" + } } // ModTime returns the modification time of the object @@ -4596,6 +4621,11 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read } httpReq, resp := o.fs.c.GetObjectRequest(&req) fs.FixRangeOption(options, o.bytes) + + // Override the automatic decompression in the transport to + // download compressed files as-is + httpReq.HTTPRequest.Header.Set("Accept-Encoding", "gzip") + for _, option := range options { switch option.(type) { case *fs.RangeOption, *fs.SeekOption: @@ -4646,6 +4676,17 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read setFrom_s3HeadObjectOutput_s3GetObjectOutput(&head, resp) head.ContentLength = size o.setMetaData(&head) + + // Decompress body if necessary + if aws.StringValue(resp.ContentEncoding) == "gzip" { + if o.fs.opt.Decompress { + return readers.NewGzipReader(resp.Body) + } + o.fs.warnCompressed.Do(func() { + fs.Logf(o, "Not decompressing 'Content-Encoding: gzip' compressed file. 
Use --s3-decompress to override") + }) + } + return resp.Body, nil } diff --git a/backend/s3/s3_internal_test.go b/backend/s3/s3_internal_test.go index 948b5242e..be12cc42d 100644 --- a/backend/s3/s3_internal_test.go +++ b/backend/s3/s3_internal_test.go @@ -4,6 +4,7 @@ import ( "bytes" "compress/gzip" "context" + "crypto/md5" "fmt" "testing" "time" @@ -11,6 +12,7 @@ import ( "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/s3" "github.com/rclone/rclone/fs" + "github.com/rclone/rclone/fs/hash" "github.com/rclone/rclone/fstest" "github.com/rclone/rclone/fstest/fstests" "github.com/rclone/rclone/lib/random" @@ -29,9 +31,15 @@ func gz(t *testing.T, s string) string { return buf.String() } +func md5sum(t *testing.T, s string) string { + hash := md5.Sum([]byte(s)) + return fmt.Sprintf("%x", hash) +} + func (f *Fs) InternalTestMetadata(t *testing.T) { ctx := context.Background() - contents := gz(t, random.String(1000)) + original := random.String(1000) + contents := gz(t, original) item := fstest.NewItem("test-metadata", contents, fstest.Time("2001-05-06T04:05:06.499999999Z")) btime := time.Now() @@ -68,6 +76,31 @@ func (f *Fs) InternalTestMetadata(t *testing.T) { assert.Equal(t, v, got, k) } } + + t.Run("GzipEncoding", func(t *testing.T) { + // Test that the gzipped file we uploaded can be + // downloaded with and without decompression + checkDownload := func(wantContents string, wantSize int64, wantHash string) { + gotContents := fstests.ReadObject(ctx, t, o, -1) + assert.Equal(t, wantContents, gotContents) + assert.Equal(t, wantSize, o.Size()) + gotHash, err := o.Hash(ctx, hash.MD5) + require.NoError(t, err) + assert.Equal(t, wantHash, gotHash) + } + + t.Run("NoDecompress", func(t *testing.T) { + checkDownload(contents, int64(len(contents)), md5sum(t, contents)) + }) + t.Run("Decompress", func(t *testing.T) { + f.opt.Decompress = true + defer func() { + f.opt.Decompress = false + }() + checkDownload(original, -1, "") + }) + + }) } func (f *Fs) 
InternalTestNoHead(t *testing.T) {