s3: add --s3-decompress flag to download gzip-encoded files

Before this change, if an object compressed with "Content-Encoding:
gzip" was downloaded, a length and hash mismatch would occur because
the Go HTTP transport automatically decompressed the object on
download.

If --s3-decompress is set, this change erases the length and hash on
compressed objects so they can be downloaded successfully, at the cost
of not being able to check the length or the hash of the downloaded
object.

If --s3-decompress is not set, the compressed files will be downloaded
as-is, providing compressed objects with intact size and hash
information.

See #2658
Nick Craig-Wood 2022-07-29 17:01:59 +01:00
parent 1f5e7ce598
commit ebe86c6cec
2 changed files with 93 additions and 19 deletions
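
Background on the transport behaviour the commit message describes: Go's
net/http transport adds "Accept-Encoding: gzip" to requests on its own and
transparently gunzips the response, discarding the original Content-Length,
which is why the stored size and MD5 no longer match the bytes read. Setting
the header explicitly opts out of that. A minimal, self-contained sketch (not
rclone code; the URL is a placeholder):

package main

import (
	"fmt"
	"io"
	"net/http"
)

// fetch downloads url once, optionally setting Accept-Encoding by hand.
func fetch(url string, manualEncoding bool) error {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	if manualEncoding {
		// Setting the header ourselves disables the transport's automatic
		// gunzip, so the body arrives still compressed and the stored
		// Content-Length and hash stay valid.
		req.Header.Set("Accept-Encoding", "gzip")
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	n, err := io.Copy(io.Discard, resp.Body)
	if err != nil {
		return err
	}
	// When the transport decompressed the body itself, resp.Uncompressed is
	// true and resp.ContentLength is -1, so n no longer matches the stored size.
	fmt.Printf("manual=%v uncompressed=%v contentLength=%d read=%d bytes\n",
		manualEncoding, resp.Uncompressed, resp.ContentLength, n)
	return nil
}

func main() {
	// Placeholder URL for an object stored with "Content-Encoding: gzip".
	for _, manual := range []bool{false, true} {
		if err := fetch("https://example.com/compressed-object", manual); err != nil {
			fmt.Println(err)
		}
	}
}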

backend/s3/s3.go

@@ -2004,6 +2004,19 @@ See [the time option docs](/docs/#time-option) for valid formats.
 `,
 			Default:  fs.Time{},
 			Advanced: true,
+		}, {
+			Name: "decompress",
+			Help: `If set this will decompress gzip encoded objects.
+
+It is possible to upload objects to S3 with "Content-Encoding: gzip"
+set. Normally rclone will download these files as compressed objects.
+
+If this flag is set then rclone will decompress these files with
+"Content-Encoding: gzip" as they are received. This means that rclone
+can't check the size and hash but the file contents will be decompressed.
+`,
+			Advanced: true,
+			Default:  false,
 		}},
 	})
 }
@@ -2128,28 +2141,30 @@ type Options struct {
 	UsePresignedRequest bool    `config:"use_presigned_request"`
 	Versions            bool    `config:"versions"`
 	VersionAt           fs.Time `config:"version_at"`
+	Decompress          bool    `config:"decompress"`
 }

 // Fs represents a remote s3 server
 type Fs struct {
 	name           string           // the name of the remote
 	root           string           // root of the bucket - ignore all objects above this
 	opt            Options          // parsed options
 	ci             *fs.ConfigInfo   // global config
 	ctx            context.Context  // global context for reading config
 	features       *fs.Features     // optional features
 	c              *s3.S3           // the connection to the s3 server
 	ses            *session.Session // the s3 session
 	rootBucket     string           // bucket part of root (if any)
 	rootDirectory  string           // directory part of root (if any)
 	cache          *bucket.Cache    // cache for bucket creation status
 	pacer          *fs.Pacer        // To pace the API calls
 	srv            *http.Client     // a plain http client
 	srvRest        *rest.Client     // the rest connection to the server
 	pool           *pool.Pool       // memory pool
 	etagIsNotMD5   bool             // if set ETags are not MD5s
 	versioningMu   sync.Mutex
 	versioning     fs.Tristate      // if set bucket is using versions
+	warnCompressed sync.Once        // warn once about compressed files
 }

 // Object describes a s3 object
@@ -4318,6 +4333,10 @@ func (o *Object) Hash(ctx context.Context, t hash.Type) (string, error) {
 	if t != hash.MD5 {
 		return "", hash.ErrUnsupported
 	}
+	// If decompressing, erase the hash
+	if o.bytes < 0 {
+		return "", nil
+	}
 	// If we haven't got an MD5, then check the metadata
 	if o.md5 == "" {
 		err := o.readMetaData(ctx)
@@ -4439,6 +4458,12 @@ func (o *Object) setMetaData(resp *s3.HeadObjectOutput) {
 	o.contentDisposition = resp.ContentDisposition
 	o.contentEncoding = resp.ContentEncoding
 	o.contentLanguage = resp.ContentLanguage
+
+	// If decompressing then size and md5sum are unknown
+	if o.fs.opt.Decompress && aws.StringValue(o.contentEncoding) == "gzip" {
+		o.bytes = -1
+		o.md5 = ""
+	}
 }

 // ModTime returns the modification time of the object
@@ -4596,6 +4621,11 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read
 	}
 	httpReq, resp := o.fs.c.GetObjectRequest(&req)
 	fs.FixRangeOption(options, o.bytes)
+
+	// Override the automatic decompression in the transport to
+	// download compressed files as-is
+	httpReq.HTTPRequest.Header.Set("Accept-Encoding", "gzip")
+
 	for _, option := range options {
 		switch option.(type) {
 		case *fs.RangeOption, *fs.SeekOption:
@@ -4646,6 +4676,17 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read
 	setFrom_s3HeadObjectOutput_s3GetObjectOutput(&head, resp)
 	head.ContentLength = size
 	o.setMetaData(&head)
+
+	// Decompress body if necessary
+	if aws.StringValue(resp.ContentEncoding) == "gzip" {
+		if o.fs.opt.Decompress {
+			return readers.NewGzipReader(resp.Body)
+		}
+		o.fs.warnCompressed.Do(func() {
+			fs.Logf(o, "Not decompressing 'Content-Encoding: gzip' compressed file. Use --s3-decompress to override")
+		})
+	}
+
 	return resp.Body, nil
 }

backend/s3/s3_internal_test.go

@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"compress/gzip"
 	"context"
+	"crypto/md5"
 	"fmt"
 	"testing"
 	"time"
@@ -11,6 +12,7 @@ import (
 	"github.com/aws/aws-sdk-go/aws"
 	"github.com/aws/aws-sdk-go/service/s3"
 	"github.com/rclone/rclone/fs"
+	"github.com/rclone/rclone/fs/hash"
 	"github.com/rclone/rclone/fstest"
 	"github.com/rclone/rclone/fstest/fstests"
 	"github.com/rclone/rclone/lib/random"
@@ -29,9 +31,15 @@ func gz(t *testing.T, s string) string {
 	return buf.String()
 }

+func md5sum(t *testing.T, s string) string {
+	hash := md5.Sum([]byte(s))
+	return fmt.Sprintf("%x", hash)
+}
+
 func (f *Fs) InternalTestMetadata(t *testing.T) {
 	ctx := context.Background()
-	contents := gz(t, random.String(1000))
+	original := random.String(1000)
+	contents := gz(t, original)
 	item := fstest.NewItem("test-metadata", contents, fstest.Time("2001-05-06T04:05:06.499999999Z"))
 	btime := time.Now()
@@ -68,6 +76,31 @@ func (f *Fs) InternalTestMetadata(t *testing.T) {
 		assert.Equal(t, v, got, k)
 	}
 	}
+
+	t.Run("GzipEncoding", func(t *testing.T) {
+		// Test that the gzipped file we uploaded can be
+		// downloaded with and without decompression
+		checkDownload := func(wantContents string, wantSize int64, wantHash string) {
+			gotContents := fstests.ReadObject(ctx, t, o, -1)
+			assert.Equal(t, wantContents, gotContents)
+			assert.Equal(t, wantSize, o.Size())
+			gotHash, err := o.Hash(ctx, hash.MD5)
+			require.NoError(t, err)
+			assert.Equal(t, wantHash, gotHash)
+		}
+
+		t.Run("NoDecompress", func(t *testing.T) {
+			checkDownload(contents, int64(len(contents)), md5sum(t, contents))
+		})
+
+		t.Run("Decompress", func(t *testing.T) {
+			f.opt.Decompress = true
+			defer func() {
+				f.opt.Decompress = false
+			}()
+			checkDownload(original, -1, "")
+		})
+	})
 }

 func (f *Fs) InternalTestNoHead(t *testing.T) {
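
For readers following the Decompress subtest: checkDownload(original, -1, "")
works because rclone treats a negative Size() as "unknown" and an empty hash
string as "not available", so verification is skipped rather than failed. A
schematic of that convention (illustrative only, not rclone's actual
verification code; the example values are placeholders):

package main

import "fmt"

// verifySketch shows how unknown metadata short-circuits verification:
// a negative size or an empty hash means "nothing to compare", not a failure.
func verifySketch(wantSize, gotSize int64, wantMD5, gotMD5 string) bool {
	if wantSize >= 0 && wantSize != gotSize {
		return false // size is known and does not match
	}
	if wantMD5 != "" && wantMD5 != gotMD5 {
		return false // hash is known and does not match
	}
	return true // everything that could be checked matched
}

func main() {
	// Mirrors the two subtests: a known size/hash must match exactly,
	// while an unknown size (-1) and an empty hash are simply not checked.
	fmt.Println(verifySketch(100, 100, "abc", "abc")) // true: both known and matching
	fmt.Println(verifySketch(-1, 1000, "", "abc"))    // true: nothing known to check
}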