chunker: option to hash all files

This commit is contained in:
Ivan Andreev 2019-10-09 13:24:03 +03:00 committed by Nick Craig-Wood
parent 9049bb62ca
commit 910c80bd02
3 changed files with 166 additions and 95 deletions

View file

@ -164,24 +164,28 @@ It has the following fields: ver, size, nchunks, md5, sha1.`,
Name: "hash_type",
Advanced: false,
Default: "md5",
Help: `Choose how chunker handles hash sums.`,
Help: `Choose how chunker handles hash sums. All modes but "none" require metadata.`,
Examples: []fs.OptionExample{{
Value: "none",
Help: `Chunker can pass any hash supported by wrapped remote
for non-chunked files but returns nothing otherwise.`,
Help: `Pass any hash supported by wrapped remote for non-chunked files, return nothing otherwise`,
}, {
Value: "md5",
Help: `MD5 for composite files. Requires "simplejson".`,
Help: `MD5 for composite files`,
}, {
Value: "sha1",
Help: `SHA1 for composite files. Requires "simplejson".`,
Help: `SHA1 for composite files`,
}, {
Value: "md5all",
Help: `MD5 for all files`,
}, {
Value: "sha1all",
Help: `SHA1 for all files`,
}, {
Value: "md5quick",
Help: `Copying a file to chunker will request MD5 from the source
falling back to SHA1 if unsupported. Requires "simplejson".`,
Help: `Copying a file to chunker will request MD5 from the source falling back to SHA1 if unsupported`,
}, {
Value: "sha1quick",
Help: `Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".`,
Help: `Similar to "md5quick" but prefers SHA1 over MD5`,
}},
}, {
Name: "fail_hard",
@ -240,38 +244,8 @@ func NewFs(name, rpath string, m configmap.Mapper) (fs.Fs, error) {
}
f.dirSort = true // processEntries requires that meta Objects prerun data chunks atm.
switch opt.MetaFormat {
case "none":
f.useMeta = false
case "simplejson":
f.useMeta = true
default:
return nil, fmt.Errorf("unsupported meta format '%s'", opt.MetaFormat)
}
requireMetaHash := true
switch opt.HashType {
case "none":
requireMetaHash = false
case "md5":
f.useMD5 = true
case "sha1":
f.useSHA1 = true
case "md5quick":
f.useMD5 = true
f.quickHash = true
case "sha1quick":
f.useSHA1 = true
f.quickHash = true
default:
return nil, fmt.Errorf("unsupported hash type '%s'", opt.HashType)
}
if requireMetaHash && opt.MetaFormat != "simplejson" {
return nil, fmt.Errorf("hash type '%s' requires meta format 'simplejson'", opt.HashType)
}
if err := f.setChunkNameFormat(opt.NameFormat); err != nil {
return nil, errors.Wrapf(err, "invalid name format '%s'", opt.NameFormat)
if err := f.configure(opt.NameFormat, opt.MetaFormat, opt.HashType); err != nil {
return nil, err
}
// Handle the tricky case detected by FsMkdir/FsPutFiles/FsIsFile
@ -317,20 +291,87 @@ type Options struct {
// Fs represents a wrapped fs.Fs
type Fs struct {
name string
root string
base fs.Fs // remote wrapped by chunker overlay
wrapper fs.Fs // wrapper is used by SetWrapper
useMeta bool // false if metadata format is 'none'
useMD5 bool // mutually exclusive with useSHA1
useSHA1 bool // mutually exclusive with useMD5
quickHash bool // allows fallback from MD5 to SHA1 and vice versa
dataNameFmt string // name format of data chunks
ctrlNameFmt string // name format of control chunks
nameRegexp *regexp.Regexp // regular expression to match chunk names
opt Options // copy of Options
features *fs.Features // optional features
dirSort bool // reserved for future, ignored
name string
root string
base fs.Fs // remote wrapped by chunker overlay
wrapper fs.Fs // wrapper is used by SetWrapper
useMeta bool // false if metadata format is 'none'
useMD5 bool // mutually exclusive with useSHA1
useSHA1 bool // mutually exclusive with useMD5
hashFallback bool // allows fallback from MD5 to SHA1 and vice versa
hashAll bool // hash all files, mutually exclusive with hashFallback
dataNameFmt string // name format of data chunks
ctrlNameFmt string // name format of control chunks
nameRegexp *regexp.Regexp // regular expression to match chunk names
opt Options // copy of Options
features *fs.Features // optional features
dirSort bool // reserved for future, ignored
}
// configure must be called only from NewFs or by unit tests
func (f *Fs) configure(nameFormat, metaFormat, hashType string) error {
if err := f.setChunkNameFormat(nameFormat); err != nil {
return errors.Wrapf(err, "invalid name format '%s'", nameFormat)
}
if err := f.setMetaFormat(metaFormat); err != nil {
return err
}
if err := f.setHashType(hashType); err != nil {
return err
}
return nil
}
func (f *Fs) setMetaFormat(metaFormat string) error {
switch metaFormat {
case "none":
f.useMeta = false
case "simplejson":
f.useMeta = true
default:
return fmt.Errorf("unsupported meta format '%s'", metaFormat)
}
return nil
}
// setHashType
// must be called *after* setMetaFormat.
//
// In the "All" mode chunker will force metadata on all files
// if the wrapped remote can't provide given hashsum.
func (f *Fs) setHashType(hashType string) error {
f.useMD5 = false
f.useSHA1 = false
f.hashFallback = false
f.hashAll = false
requireMetaHash := true
switch hashType {
case "none":
requireMetaHash = false
case "md5":
f.useMD5 = true
case "sha1":
f.useSHA1 = true
case "md5quick":
f.useMD5 = true
f.hashFallback = true
case "sha1quick":
f.useSHA1 = true
f.hashFallback = true
case "md5all":
f.useMD5 = true
f.hashAll = !f.base.Hashes().Contains(hash.MD5)
case "sha1all":
f.useSHA1 = true
f.hashAll = !f.base.Hashes().Contains(hash.SHA1)
default:
return fmt.Errorf("unsupported hash type '%s'", hashType)
}
if requireMetaHash && !f.useMeta {
return fmt.Errorf("hash type '%s' requires compatible meta format", hashType)
}
return nil
}
// setChunkNameFormat converts pattern based chunk name format
@ -877,8 +918,8 @@ func (f *Fs) put(ctx context.Context, in io.Reader, src fs.ObjectInfo, remote st
// Finalize small object as non-chunked.
// This can be bypassed, and single chunk with metadata will be
// created due to unsafe input.
if !needMeta && f.useMeta {
// created if forced by consistent hashing or due to unsafe input.
if !needMeta && !f.hashAll && f.useMeta {
// If previous object was chunked, remove its chunks
f.removeOldChunks(ctx, baseRemote)
@ -989,7 +1030,7 @@ func (c *chunkingReader) wrapStream(ctx context.Context, in io.Reader, src fs.Ob
switch {
case c.fs.useMD5:
if c.md5, _ = src.Hash(ctx, hash.MD5); c.md5 == "" {
if c.fs.quickHash {
if c.fs.hashFallback {
c.sha1, _ = src.Hash(ctx, hash.SHA1)
} else {
c.hasher = md5.New()
@ -997,7 +1038,7 @@ func (c *chunkingReader) wrapStream(ctx context.Context, in io.Reader, src fs.Ob
}
case c.fs.useSHA1:
if c.sha1, _ = src.Hash(ctx, hash.SHA1); c.sha1 == "" {
if c.fs.quickHash {
if c.fs.hashFallback {
c.md5, _ = src.Hash(ctx, hash.MD5)
} else {
c.hasher = sha1.New()
@ -1157,11 +1198,11 @@ func (f *Fs) Precision() time.Duration {
// Chunker advertises a hash type if and only if it can be calculated
// for files of any size, non-chunked or composite.
func (f *Fs) Hashes() hash.Set {
// composites && all of them && small files supported by wrapped remote
if f.useMD5 && !f.quickHash && f.base.Hashes().Contains(hash.MD5) {
// composites AND no fallback AND (chunker OR wrapped Fs will hash all non-chunked's)
if f.useMD5 && !f.hashFallback && (f.hashAll || f.base.Hashes().Contains(hash.MD5)) {
return hash.NewHashSet(hash.MD5)
}
if f.useSHA1 && !f.quickHash && f.base.Hashes().Contains(hash.SHA1) {
if f.useSHA1 && !f.hashFallback && (f.hashAll || f.base.Hashes().Contains(hash.SHA1)) {
return hash.NewHashSet(hash.SHA1)
}
return hash.NewHashSet() // can't provide strong guarantees
@ -1383,14 +1424,14 @@ func (f *Fs) okForServerSide(ctx context.Context, src fs.Object, opName string)
case f.useMD5:
md5, _ = obj.Hash(ctx, hash.MD5)
ok = md5 != ""
if !ok && f.quickHash {
if !ok && f.hashFallback {
sha1, _ = obj.Hash(ctx, hash.SHA1)
ok = sha1 != ""
}
case f.useSHA1:
sha1, _ = obj.Hash(ctx, hash.SHA1)
ok = sha1 != ""
if !ok && f.quickHash {
if !ok && f.hashFallback {
md5, _ = obj.Hash(ctx, hash.MD5)
ok = md5 != ""
}
@ -1678,17 +1719,14 @@ func (o *Object) SetModTime(ctx context.Context, mtime time.Time) error {
// Hash returns the selected checksum of the file.
// If no checksum is available it returns "".
// If a particular hashsum type is not supported, chunker won't fail
// with `unsupported` error but return the empty hash string.
//
// Currently metadata (if not configured as 'none') is kept only for
// composite files, but for non-chunked small files chunker obtains
// hashsums from wrapped remote.
// If a particular hashsum type is not supported, chunker won't fail
// with `unsupported` error but return the empty hash string.
//
// In future metadata logic can be extended: if a normal (non-quick)
// hash type is configured, chunker will check whether wrapped remote
// supports it (see Fs.Hashes as an example). If not, it will add metadata
// to small files as well, thus providing hashsums for all files.
// In the "All" mode chunker will force metadata on all files if
// particular hashsum type is unsupported by wrapped remote.
//
func (o *Object) Hash(ctx context.Context, hashType hash.Type) (string, error) {
if !o.isComposite() {
@ -1976,9 +2014,10 @@ type metaSimpleJSON struct {
// marshalSimpleJSON
//
// Current implementation creates metadata in two cases:
// Current implementation creates metadata in three cases:
// - for files larger than chunk size
// - if file contents can be mistaken as meta object
// - if consistent hashing is on but wrapped remote can't provide given hash
//
func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 string) ([]byte, error) {
version := metadataVersion
@ -2000,13 +2039,9 @@ func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 s
}
// unmarshalSimpleJSON
// Note: only metadata format version 1 is supported atm.
//
// Current implementation creates metadata only for files larger than
// configured chunk size. This approach has drawback: availability of
// configured hashsum type for small files depends on the wrapped remote.
// Future versions of chunker may change approach as described in comment
// to the Hash method. They can transparently migrate older metadata.
// Only metadata format version 1 is supported atm.
// Future releases will transparently migrate older metadata objects.
// New format will have a higher version number and cannot be correctly
// handled by current implementation.
// The version check below will then explicitly ask user to upgrade rclone.

View file

@ -12,6 +12,7 @@ import (
"testing"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/fs/operations"
"github.com/rclone/rclone/fstest"
"github.com/rclone/rclone/fstest/fstests"
@ -276,6 +277,11 @@ func testSmallFileInternals(t *testing.T, f *Fs) {
assert.Nil(t, o.main)
assert.True(t, o.isComposite()) // sorry, sometimes a name is misleading
assert.Equal(t, 1, len(o.chunks))
case f.hashAll:
// Consistent hashing forces meta object on small files too
assert.NotNil(t, o.main)
assert.True(t, o.isComposite())
assert.Equal(t, 1, len(o.chunks))
default:
// normally non-chunked file is kept in the Object's main field
assert.NotNil(t, o.main)
@ -300,6 +306,24 @@ func testSmallFileInternals(t *testing.T, f *Fs) {
_ = r.Close()
}
checkHashsum := func(obj fs.Object) {
var ht hash.Type
switch {
case !f.hashAll:
return
case f.useMD5:
ht = hash.MD5
case f.useSHA1:
ht = hash.SHA1
default:
return
}
// even empty files must have hashsum in consistent mode
sum, err := obj.Hash(ctx, ht)
assert.NoError(t, err)
assert.NotEqual(t, sum, "")
}
checkSmallFile := func(name, contents string) {
filename := path.Join(dir, name)
item := fstest.Item{Path: filename, ModTime: modTime}
@ -307,6 +331,7 @@ func testSmallFileInternals(t *testing.T, f *Fs) {
assert.NotNil(t, put)
checkSmallFileInternals(put)
checkContents(put, contents)
checkHashsum(put)
// objects returned by Put and NewObject must have similar structure
obj, err := f.NewObject(ctx, filename)
@ -314,6 +339,7 @@ func testSmallFileInternals(t *testing.T, f *Fs) {
assert.NotNil(t, obj)
checkSmallFileInternals(obj)
checkContents(obj, contents)
checkHashsum(obj)
_ = obj.Remove(ctx)
_ = put.Remove(ctx) // for good

View file

@ -46,20 +46,22 @@ remote> remote:path
Files larger than chunk size will be split in chunks.
Enter a size with suffix k,M,G,T. Press Enter for the default ("2G").
chunk_size> 100M
Choose how chunker handles hash sums.
Choose how chunker handles hash sums. All modes but "none" require metadata.
Enter a string value. Press Enter for the default ("md5").
Choose a number from below, or type in your own value
/ Chunker can pass any hash supported by wrapped remote
1 | for non-chunked files but returns nothing otherwise.
1 / Pass any hash supported by wrapped remote for non-chunked files, return nothing otherwise
\ "none"
2 / MD5 for composite files. Requires "simplejson".
2 / MD5 for composite files
\ "md5"
3 / SHA1 for composite files. Requires "simplejson".
3 / SHA1 for composite files
\ "sha1"
/ Copying a file to chunker will request MD5 from the source
4 | falling back to SHA1 if unsupported. Requires "simplejson".
4 / MD5 for all files
\ "md5all"
5 / SHA1 for all files
\ "sha1all"
6 / Copying a file to chunker will request MD5 from the source falling back to SHA1 if unsupported
\ "md5quick"
5 / Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
7 / Similar to "md5quick" but prefers SHA1 over MD5
\ "sha1quick"
hash_type> md5
Edit advanced config? (y/n)
@ -190,8 +192,8 @@ Chunker supports hashsums only when a compatible metadata is present.
Hence, if you choose metadata format of `none`, chunker will report hashsum
as `UNSUPPORTED`.
Please note that metadata is stored only for composite files. If a file
is small (smaller than configured chunk size), chunker will transparently
Please note that by default metadata is stored only for composite files.
If a file is smaller than configured chunk size, chunker will transparently
redirect hash requests to wrapped remote, so support depends on that.
You will see the empty string as a hashsum of requested type for small
files if the wrapped remote doesn't support it.
@ -204,6 +206,12 @@ wrapped remote hash for non-chunked ones, we advise you to choose the same
hash type as supported by wrapped remote so that your file listings
look coherent.
If your storage backend does not support MD5 or SHA1 but you need consistent
file hashing, configure chunker with `md5all` or `sha1all`. These two modes
guarantee given hash for all files. If wrapped remote doesn't support it,
chunker will then add metadata to all files, even small. However, this can
double the amount of small files in storage and incur additional service charges.
Normally, when a file is copied to chunker controlled remote, chunker
will ask the file source for compatible file hash and revert to on-the-fly
calculation if none is found. This involves some CPU overhead but provides
@ -309,7 +317,7 @@ Files larger than chunk size will be split in chunks.
#### --chunker-hash-type
Choose how chunker handles hash sums.
Choose how chunker handles hash sums. All modes but "none" require metadata.
- Config: hash_type
- Env Var: RCLONE_CHUNKER_HASH_TYPE
@ -317,17 +325,19 @@ Choose how chunker handles hash sums.
- Default: "md5"
- Examples:
- "none"
- Chunker can pass any hash supported by wrapped remote
- for non-chunked files but returns nothing otherwise.
- Pass any hash supported by wrapped remote for non-chunked files, return nothing otherwise
- "md5"
- MD5 for composite files. Requires "simplejson".
- MD5 for composite files
- "sha1"
- SHA1 for composite files. Requires "simplejson".
- SHA1 for composite files
- "md5all"
- MD5 for all files
- "sha1all"
- SHA1 for all files
- "md5quick"
- Copying a file to chunker will request MD5 from the source
- falling back to SHA1 if unsupported. Requires "simplejson".
- Copying a file to chunker will request MD5 from the source falling back to SHA1 if unsupported
- "sha1quick"
- Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
- Similar to "md5quick" but prefers SHA1 over MD5
### Advanced Options