s3: add --s3-decompress flag to download gzip-encoded files

Before this change, if an object compressed with "Content-Encoding:
gzip" was downloaded, a length and hash mismatch would occur since the
Go runtime automatically decompressed the object on download.

If --s3-decompress is set, this change erases the length and hash on
compressed objects so they can be downloaded successfully, at the cost
of not being able to check the length or the hash of the downloaded
object.

If --s3-decompress is not set the compressed files will be downloaded
as-is providing compressed objects with intact size and hash
information.

See #2658
This commit is contained in:
Nick Craig-Wood 2022-07-29 17:01:59 +01:00
parent 1f5e7ce598
commit ebe86c6cec
2 changed files with 93 additions and 19 deletions

View file

@ -2004,6 +2004,19 @@ See [the time option docs](/docs/#time-option) for valid formats.
`,
Default: fs.Time{},
Advanced: true,
}, {
Name: "decompress",
Help: `If set this will decompress gzip encoded objects.
It is possible to upload objects to S3 with "Content-Encoding: gzip"
set. Normally rclone will download these files as compressed objects.
If this flag is set then rclone will decompress these files with
"Content-Encoding: gzip" as they are received. This means that rclone
can't check the size and hash but the file contents will be decompressed.
`,
Advanced: true,
Default: false,
},
}})
}
@ -2128,28 +2141,30 @@ type Options struct {
UsePresignedRequest bool `config:"use_presigned_request"`
Versions bool `config:"versions"`
VersionAt fs.Time `config:"version_at"`
Decompress bool `config:"decompress"`
}
// Fs represents a remote s3 server
type Fs struct {
name string // the name of the remote
root string // root of the bucket - ignore all objects above this
opt Options // parsed options
ci *fs.ConfigInfo // global config
ctx context.Context // global context for reading config
features *fs.Features // optional features
c *s3.S3 // the connection to the s3 server
ses *session.Session // the s3 session
rootBucket string // bucket part of root (if any)
rootDirectory string // directory part of root (if any)
cache *bucket.Cache // cache for bucket creation status
pacer *fs.Pacer // To pace the API calls
srv *http.Client // a plain http client
srvRest *rest.Client // the rest connection to the server
pool *pool.Pool // memory pool
etagIsNotMD5 bool // if set ETags are not MD5s
versioningMu sync.Mutex
versioning fs.Tristate // if set bucket is using versions
name string // the name of the remote
root string // root of the bucket - ignore all objects above this
opt Options // parsed options
ci *fs.ConfigInfo // global config
ctx context.Context // global context for reading config
features *fs.Features // optional features
c *s3.S3 // the connection to the s3 server
ses *session.Session // the s3 session
rootBucket string // bucket part of root (if any)
rootDirectory string // directory part of root (if any)
cache *bucket.Cache // cache for bucket creation status
pacer *fs.Pacer // To pace the API calls
srv *http.Client // a plain http client
srvRest *rest.Client // the rest connection to the server
pool *pool.Pool // memory pool
etagIsNotMD5 bool // if set ETags are not MD5s
versioningMu sync.Mutex
versioning fs.Tristate // if set bucket is using versions
warnCompressed sync.Once // warn once about compressed files
}
// Object describes a s3 object
@ -4318,6 +4333,10 @@ func (o *Object) Hash(ctx context.Context, t hash.Type) (string, error) {
if t != hash.MD5 {
return "", hash.ErrUnsupported
}
// If decompressing, erase the hash
if o.bytes < 0 {
return "", nil
}
// If we haven't got an MD5, then check the metadata
if o.md5 == "" {
err := o.readMetaData(ctx)
@ -4439,6 +4458,12 @@ func (o *Object) setMetaData(resp *s3.HeadObjectOutput) {
o.contentDisposition = resp.ContentDisposition
o.contentEncoding = resp.ContentEncoding
o.contentLanguage = resp.ContentLanguage
// If decompressing then size and md5sum are unknown
if o.fs.opt.Decompress && aws.StringValue(o.contentEncoding) == "gzip" {
o.bytes = -1
o.md5 = ""
}
}
// ModTime returns the modification time of the object
@ -4596,6 +4621,11 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read
}
httpReq, resp := o.fs.c.GetObjectRequest(&req)
fs.FixRangeOption(options, o.bytes)
// Override the automatic decompression in the transport to
// download compressed files as-is
httpReq.HTTPRequest.Header.Set("Accept-Encoding", "gzip")
for _, option := range options {
switch option.(type) {
case *fs.RangeOption, *fs.SeekOption:
@ -4646,6 +4676,17 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read
setFrom_s3HeadObjectOutput_s3GetObjectOutput(&head, resp)
head.ContentLength = size
o.setMetaData(&head)
// Decompress body if necessary
if aws.StringValue(resp.ContentEncoding) == "gzip" {
if o.fs.opt.Decompress {
return readers.NewGzipReader(resp.Body)
}
o.fs.warnCompressed.Do(func() {
fs.Logf(o, "Not decompressing 'Content-Encoding: gzip' compressed file. Use --s3-decompress to override")
})
}
return resp.Body, nil
}

View file

@ -4,6 +4,7 @@ import (
"bytes"
"compress/gzip"
"context"
"crypto/md5"
"fmt"
"testing"
"time"
@ -11,6 +12,7 @@ import (
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/s3"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/fstest"
"github.com/rclone/rclone/fstest/fstests"
"github.com/rclone/rclone/lib/random"
@ -29,9 +31,15 @@ func gz(t *testing.T, s string) string {
return buf.String()
}
// md5sum returns the hex-encoded MD5 digest of s, for use as an
// expected-hash value in tests.
//
// The local is named sum rather than hash to avoid shadowing the
// imported rclone fs/hash package within this function.
func md5sum(t *testing.T, s string) string {
	sum := md5.Sum([]byte(s))
	return fmt.Sprintf("%x", sum)
}
func (f *Fs) InternalTestMetadata(t *testing.T) {
ctx := context.Background()
contents := gz(t, random.String(1000))
original := random.String(1000)
contents := gz(t, original)
item := fstest.NewItem("test-metadata", contents, fstest.Time("2001-05-06T04:05:06.499999999Z"))
btime := time.Now()
@ -68,6 +76,31 @@ func (f *Fs) InternalTestMetadata(t *testing.T) {
assert.Equal(t, v, got, k)
}
}
t.Run("GzipEncoding", func(t *testing.T) {
// Test that the gzipped file we uploaded can be
// downloaded with and without decompression
checkDownload := func(wantContents string, wantSize int64, wantHash string) {
gotContents := fstests.ReadObject(ctx, t, o, -1)
assert.Equal(t, wantContents, gotContents)
assert.Equal(t, wantSize, o.Size())
gotHash, err := o.Hash(ctx, hash.MD5)
require.NoError(t, err)
assert.Equal(t, wantHash, gotHash)
}
t.Run("NoDecompress", func(t *testing.T) {
checkDownload(contents, int64(len(contents)), md5sum(t, contents))
})
t.Run("Decompress", func(t *testing.T) {
f.opt.Decompress = true
defer func() {
f.opt.Decompress = false
}()
checkDownload(original, -1, "")
})
})
}
func (f *Fs) InternalTestNoHead(t *testing.T) {