forked from TrueCloudLab/rclone
s3: add --s3-decompress flag to download gzip-encoded files
Before this change, if an object compressed with "Content-Encoding: gzip" was downloaded, a length and hash mismatch would occur since the Go runtime automatically decompressed the object on download. If --s3-decompress is set, this change erases the length and hash on compressed objects so they can be downloaded successfully, at the cost of not being able to check the length or the hash of the downloaded object. If --s3-decompress is not set, the compressed files will be downloaded as-is, providing compressed objects with intact size and hash information. See #2658
This commit is contained in:
parent
1f5e7ce598
commit
ebe86c6cec
2 changed files with 93 additions and 19 deletions
|
@ -2004,6 +2004,19 @@ See [the time option docs](/docs/#time-option) for valid formats.
|
|||
`,
|
||||
Default: fs.Time{},
|
||||
Advanced: true,
|
||||
}, {
|
||||
Name: "decompress",
|
||||
Help: `If set this will decompress gzip encoded objects.
|
||||
|
||||
It is possible to upload objects to S3 with "Content-Encoding: gzip"
|
||||
set. Normally rclone will download these files as compressed objects.
|
||||
|
||||
If this flag is set then rclone will decompress these files with
|
||||
"Content-Encoding: gzip" as they are received. This means that rclone
|
||||
can't check the size and hash but the file contents will be decompressed.
|
||||
`,
|
||||
Advanced: true,
|
||||
Default: false,
|
||||
},
|
||||
}})
|
||||
}
|
||||
|
@ -2128,28 +2141,30 @@ type Options struct {
|
|||
UsePresignedRequest bool `config:"use_presigned_request"`
|
||||
Versions bool `config:"versions"`
|
||||
VersionAt fs.Time `config:"version_at"`
|
||||
Decompress bool `config:"decompress"`
|
||||
}
|
||||
|
||||
// Fs represents a remote s3 server
|
||||
type Fs struct {
|
||||
name string // the name of the remote
|
||||
root string // root of the bucket - ignore all objects above this
|
||||
opt Options // parsed options
|
||||
ci *fs.ConfigInfo // global config
|
||||
ctx context.Context // global context for reading config
|
||||
features *fs.Features // optional features
|
||||
c *s3.S3 // the connection to the s3 server
|
||||
ses *session.Session // the s3 session
|
||||
rootBucket string // bucket part of root (if any)
|
||||
rootDirectory string // directory part of root (if any)
|
||||
cache *bucket.Cache // cache for bucket creation status
|
||||
pacer *fs.Pacer // To pace the API calls
|
||||
srv *http.Client // a plain http client
|
||||
srvRest *rest.Client // the rest connection to the server
|
||||
pool *pool.Pool // memory pool
|
||||
etagIsNotMD5 bool // if set ETags are not MD5s
|
||||
versioningMu sync.Mutex
|
||||
versioning fs.Tristate // if set bucket is using versions
|
||||
name string // the name of the remote
|
||||
root string // root of the bucket - ignore all objects above this
|
||||
opt Options // parsed options
|
||||
ci *fs.ConfigInfo // global config
|
||||
ctx context.Context // global context for reading config
|
||||
features *fs.Features // optional features
|
||||
c *s3.S3 // the connection to the s3 server
|
||||
ses *session.Session // the s3 session
|
||||
rootBucket string // bucket part of root (if any)
|
||||
rootDirectory string // directory part of root (if any)
|
||||
cache *bucket.Cache // cache for bucket creation status
|
||||
pacer *fs.Pacer // To pace the API calls
|
||||
srv *http.Client // a plain http client
|
||||
srvRest *rest.Client // the rest connection to the server
|
||||
pool *pool.Pool // memory pool
|
||||
etagIsNotMD5 bool // if set ETags are not MD5s
|
||||
versioningMu sync.Mutex
|
||||
versioning fs.Tristate // if set bucket is using versions
|
||||
warnCompressed sync.Once // warn once about compressed files
|
||||
}
|
||||
|
||||
// Object describes a s3 object
|
||||
|
@ -4318,6 +4333,10 @@ func (o *Object) Hash(ctx context.Context, t hash.Type) (string, error) {
|
|||
if t != hash.MD5 {
|
||||
return "", hash.ErrUnsupported
|
||||
}
|
||||
// If decompressing, erase the hash
|
||||
if o.bytes < 0 {
|
||||
return "", nil
|
||||
}
|
||||
// If we haven't got an MD5, then check the metadata
|
||||
if o.md5 == "" {
|
||||
err := o.readMetaData(ctx)
|
||||
|
@ -4439,6 +4458,12 @@ func (o *Object) setMetaData(resp *s3.HeadObjectOutput) {
|
|||
o.contentDisposition = resp.ContentDisposition
|
||||
o.contentEncoding = resp.ContentEncoding
|
||||
o.contentLanguage = resp.ContentLanguage
|
||||
|
||||
// If decompressing then size and md5sum are unknown
|
||||
if o.fs.opt.Decompress && aws.StringValue(o.contentEncoding) == "gzip" {
|
||||
o.bytes = -1
|
||||
o.md5 = ""
|
||||
}
|
||||
}
|
||||
|
||||
// ModTime returns the modification time of the object
|
||||
|
@ -4596,6 +4621,11 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read
|
|||
}
|
||||
httpReq, resp := o.fs.c.GetObjectRequest(&req)
|
||||
fs.FixRangeOption(options, o.bytes)
|
||||
|
||||
// Override the automatic decompression in the transport to
|
||||
// download compressed files as-is
|
||||
httpReq.HTTPRequest.Header.Set("Accept-Encoding", "gzip")
|
||||
|
||||
for _, option := range options {
|
||||
switch option.(type) {
|
||||
case *fs.RangeOption, *fs.SeekOption:
|
||||
|
@ -4646,6 +4676,17 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read
|
|||
setFrom_s3HeadObjectOutput_s3GetObjectOutput(&head, resp)
|
||||
head.ContentLength = size
|
||||
o.setMetaData(&head)
|
||||
|
||||
// Decompress body if necessary
|
||||
if aws.StringValue(resp.ContentEncoding) == "gzip" {
|
||||
if o.fs.opt.Decompress {
|
||||
return readers.NewGzipReader(resp.Body)
|
||||
}
|
||||
o.fs.warnCompressed.Do(func() {
|
||||
fs.Logf(o, "Not decompressing 'Content-Encoding: gzip' compressed file. Use --s3-decompress to override")
|
||||
})
|
||||
}
|
||||
|
||||
return resp.Body, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ import (
|
|||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"crypto/md5"
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
@ -11,6 +12,7 @@ import (
|
|||
"github.com/aws/aws-sdk-go/aws"
|
||||
"github.com/aws/aws-sdk-go/service/s3"
|
||||
"github.com/rclone/rclone/fs"
|
||||
"github.com/rclone/rclone/fs/hash"
|
||||
"github.com/rclone/rclone/fstest"
|
||||
"github.com/rclone/rclone/fstest/fstests"
|
||||
"github.com/rclone/rclone/lib/random"
|
||||
|
@ -29,9 +31,15 @@ func gz(t *testing.T, s string) string {
|
|||
return buf.String()
|
||||
}
|
||||
|
||||
// md5sum returns the lowercase hexadecimal MD5 digest of s.
// The t parameter is accepted but not used in the body.
func md5sum(t *testing.T, s string) string {
|
||||
hash := md5.Sum([]byte(s))
|
||||
return fmt.Sprintf("%x", hash)
|
||||
}
|
||||
|
||||
func (f *Fs) InternalTestMetadata(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
contents := gz(t, random.String(1000))
|
||||
original := random.String(1000)
|
||||
contents := gz(t, original)
|
||||
|
||||
item := fstest.NewItem("test-metadata", contents, fstest.Time("2001-05-06T04:05:06.499999999Z"))
|
||||
btime := time.Now()
|
||||
|
@ -68,6 +76,31 @@ func (f *Fs) InternalTestMetadata(t *testing.T) {
|
|||
assert.Equal(t, v, got, k)
|
||||
}
|
||||
}
|
||||
|
||||
t.Run("GzipEncoding", func(t *testing.T) {
|
||||
// Test that the gzipped file we uploaded can be
|
||||
// downloaded with and without decompression
|
||||
checkDownload := func(wantContents string, wantSize int64, wantHash string) {
|
||||
gotContents := fstests.ReadObject(ctx, t, o, -1)
|
||||
assert.Equal(t, wantContents, gotContents)
|
||||
assert.Equal(t, wantSize, o.Size())
|
||||
gotHash, err := o.Hash(ctx, hash.MD5)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, wantHash, gotHash)
|
||||
}
|
||||
|
||||
t.Run("NoDecompress", func(t *testing.T) {
|
||||
checkDownload(contents, int64(len(contents)), md5sum(t, contents))
|
||||
})
|
||||
t.Run("Decompress", func(t *testing.T) {
|
||||
f.opt.Decompress = true
|
||||
defer func() {
|
||||
f.opt.Decompress = false
|
||||
}()
|
||||
checkDownload(original, -1, "")
|
||||
})
|
||||
|
||||
})
|
||||
}
|
||||
|
||||
func (f *Fs) InternalTestNoHead(t *testing.T) {
|
||||
|
|
Loading…
Reference in a new issue