forked from TrueCloudLab/rclone
chunker: option to hash all files
This commit is contained in:
parent
9049bb62ca
commit
910c80bd02
3 changed files with 166 additions and 95 deletions
|
@ -164,24 +164,28 @@ It has the following fields: ver, size, nchunks, md5, sha1.`,
|
|||
Name: "hash_type",
|
||||
Advanced: false,
|
||||
Default: "md5",
|
||||
Help: `Choose how chunker handles hash sums.`,
|
||||
Help: `Choose how chunker handles hash sums. All modes but "none" require metadata.`,
|
||||
Examples: []fs.OptionExample{{
|
||||
Value: "none",
|
||||
Help: `Chunker can pass any hash supported by wrapped remote
|
||||
for non-chunked files but returns nothing otherwise.`,
|
||||
Help: `Pass any hash supported by wrapped remote for non-chunked files, return nothing otherwise`,
|
||||
}, {
|
||||
Value: "md5",
|
||||
Help: `MD5 for composite files. Requires "simplejson".`,
|
||||
Help: `MD5 for composite files`,
|
||||
}, {
|
||||
Value: "sha1",
|
||||
Help: `SHA1 for composite files. Requires "simplejson".`,
|
||||
Help: `SHA1 for composite files`,
|
||||
}, {
|
||||
Value: "md5all",
|
||||
Help: `MD5 for all files`,
|
||||
}, {
|
||||
Value: "sha1all",
|
||||
Help: `SHA1 for all files`,
|
||||
}, {
|
||||
Value: "md5quick",
|
||||
Help: `Copying a file to chunker will request MD5 from the source
|
||||
falling back to SHA1 if unsupported. Requires "simplejson".`,
|
||||
Help: `Copying a file to chunker will request MD5 from the source falling back to SHA1 if unsupported`,
|
||||
}, {
|
||||
Value: "sha1quick",
|
||||
Help: `Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".`,
|
||||
Help: `Similar to "md5quick" but prefers SHA1 over MD5`,
|
||||
}},
|
||||
}, {
|
||||
Name: "fail_hard",
|
||||
|
@ -240,38 +244,8 @@ func NewFs(name, rpath string, m configmap.Mapper) (fs.Fs, error) {
|
|||
}
|
||||
f.dirSort = true // processEntries requires that meta Objects prerun data chunks atm.
|
||||
|
||||
switch opt.MetaFormat {
|
||||
case "none":
|
||||
f.useMeta = false
|
||||
case "simplejson":
|
||||
f.useMeta = true
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported meta format '%s'", opt.MetaFormat)
|
||||
}
|
||||
|
||||
requireMetaHash := true
|
||||
switch opt.HashType {
|
||||
case "none":
|
||||
requireMetaHash = false
|
||||
case "md5":
|
||||
f.useMD5 = true
|
||||
case "sha1":
|
||||
f.useSHA1 = true
|
||||
case "md5quick":
|
||||
f.useMD5 = true
|
||||
f.quickHash = true
|
||||
case "sha1quick":
|
||||
f.useSHA1 = true
|
||||
f.quickHash = true
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported hash type '%s'", opt.HashType)
|
||||
}
|
||||
if requireMetaHash && opt.MetaFormat != "simplejson" {
|
||||
return nil, fmt.Errorf("hash type '%s' requires meta format 'simplejson'", opt.HashType)
|
||||
}
|
||||
|
||||
if err := f.setChunkNameFormat(opt.NameFormat); err != nil {
|
||||
return nil, errors.Wrapf(err, "invalid name format '%s'", opt.NameFormat)
|
||||
if err := f.configure(opt.NameFormat, opt.MetaFormat, opt.HashType); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Handle the tricky case detected by FsMkdir/FsPutFiles/FsIsFile
|
||||
|
@ -317,20 +291,87 @@ type Options struct {
|
|||
|
||||
// Fs represents a wrapped fs.Fs
|
||||
type Fs struct {
|
||||
name string
|
||||
root string
|
||||
base fs.Fs // remote wrapped by chunker overlay
|
||||
wrapper fs.Fs // wrapper is used by SetWrapper
|
||||
useMeta bool // false if metadata format is 'none'
|
||||
useMD5 bool // mutually exclusive with useSHA1
|
||||
useSHA1 bool // mutually exclusive with useMD5
|
||||
quickHash bool // allows fallback from MD5 to SHA1 and vice versa
|
||||
dataNameFmt string // name format of data chunks
|
||||
ctrlNameFmt string // name format of control chunks
|
||||
nameRegexp *regexp.Regexp // regular expression to match chunk names
|
||||
opt Options // copy of Options
|
||||
features *fs.Features // optional features
|
||||
dirSort bool // reserved for future, ignored
|
||||
name string
|
||||
root string
|
||||
base fs.Fs // remote wrapped by chunker overlay
|
||||
wrapper fs.Fs // wrapper is used by SetWrapper
|
||||
useMeta bool // false if metadata format is 'none'
|
||||
useMD5 bool // mutually exclusive with useSHA1
|
||||
useSHA1 bool // mutually exclusive with useMD5
|
||||
hashFallback bool // allows fallback from MD5 to SHA1 and vice versa
|
||||
hashAll bool // hash all files, mutually exclusive with hashFallback
|
||||
dataNameFmt string // name format of data chunks
|
||||
ctrlNameFmt string // name format of control chunks
|
||||
nameRegexp *regexp.Regexp // regular expression to match chunk names
|
||||
opt Options // copy of Options
|
||||
features *fs.Features // optional features
|
||||
dirSort bool // reserved for future, ignored
|
||||
}
|
||||
|
||||
// configure must be called only from NewFs or by unit tests
|
||||
func (f *Fs) configure(nameFormat, metaFormat, hashType string) error {
|
||||
if err := f.setChunkNameFormat(nameFormat); err != nil {
|
||||
return errors.Wrapf(err, "invalid name format '%s'", nameFormat)
|
||||
}
|
||||
if err := f.setMetaFormat(metaFormat); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := f.setHashType(hashType); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *Fs) setMetaFormat(metaFormat string) error {
|
||||
switch metaFormat {
|
||||
case "none":
|
||||
f.useMeta = false
|
||||
case "simplejson":
|
||||
f.useMeta = true
|
||||
default:
|
||||
return fmt.Errorf("unsupported meta format '%s'", metaFormat)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// setHashType
|
||||
// must be called *after* setMetaFormat.
|
||||
//
|
||||
// In the "All" mode chunker will force metadata on all files
|
||||
// if the wrapped remote can't provide given hashsum.
|
||||
func (f *Fs) setHashType(hashType string) error {
|
||||
f.useMD5 = false
|
||||
f.useSHA1 = false
|
||||
f.hashFallback = false
|
||||
f.hashAll = false
|
||||
requireMetaHash := true
|
||||
|
||||
switch hashType {
|
||||
case "none":
|
||||
requireMetaHash = false
|
||||
case "md5":
|
||||
f.useMD5 = true
|
||||
case "sha1":
|
||||
f.useSHA1 = true
|
||||
case "md5quick":
|
||||
f.useMD5 = true
|
||||
f.hashFallback = true
|
||||
case "sha1quick":
|
||||
f.useSHA1 = true
|
||||
f.hashFallback = true
|
||||
case "md5all":
|
||||
f.useMD5 = true
|
||||
f.hashAll = !f.base.Hashes().Contains(hash.MD5)
|
||||
case "sha1all":
|
||||
f.useSHA1 = true
|
||||
f.hashAll = !f.base.Hashes().Contains(hash.SHA1)
|
||||
default:
|
||||
return fmt.Errorf("unsupported hash type '%s'", hashType)
|
||||
}
|
||||
if requireMetaHash && !f.useMeta {
|
||||
return fmt.Errorf("hash type '%s' requires compatible meta format", hashType)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// setChunkNameFormat converts pattern based chunk name format
|
||||
|
@ -877,8 +918,8 @@ func (f *Fs) put(ctx context.Context, in io.Reader, src fs.ObjectInfo, remote st
|
|||
|
||||
// Finalize small object as non-chunked.
|
||||
// This can be bypassed, and single chunk with metadata will be
|
||||
// created due to unsafe input.
|
||||
if !needMeta && f.useMeta {
|
||||
// created if forced by consistent hashing or due to unsafe input.
|
||||
if !needMeta && !f.hashAll && f.useMeta {
|
||||
// If previous object was chunked, remove its chunks
|
||||
f.removeOldChunks(ctx, baseRemote)
|
||||
|
||||
|
@ -989,7 +1030,7 @@ func (c *chunkingReader) wrapStream(ctx context.Context, in io.Reader, src fs.Ob
|
|||
switch {
|
||||
case c.fs.useMD5:
|
||||
if c.md5, _ = src.Hash(ctx, hash.MD5); c.md5 == "" {
|
||||
if c.fs.quickHash {
|
||||
if c.fs.hashFallback {
|
||||
c.sha1, _ = src.Hash(ctx, hash.SHA1)
|
||||
} else {
|
||||
c.hasher = md5.New()
|
||||
|
@ -997,7 +1038,7 @@ func (c *chunkingReader) wrapStream(ctx context.Context, in io.Reader, src fs.Ob
|
|||
}
|
||||
case c.fs.useSHA1:
|
||||
if c.sha1, _ = src.Hash(ctx, hash.SHA1); c.sha1 == "" {
|
||||
if c.fs.quickHash {
|
||||
if c.fs.hashFallback {
|
||||
c.md5, _ = src.Hash(ctx, hash.MD5)
|
||||
} else {
|
||||
c.hasher = sha1.New()
|
||||
|
@ -1157,11 +1198,11 @@ func (f *Fs) Precision() time.Duration {
|
|||
// Chunker advertises a hash type if and only if it can be calculated
|
||||
// for files of any size, non-chunked or composite.
|
||||
func (f *Fs) Hashes() hash.Set {
|
||||
// composites && all of them && small files supported by wrapped remote
|
||||
if f.useMD5 && !f.quickHash && f.base.Hashes().Contains(hash.MD5) {
|
||||
// composites AND no fallback AND (chunker OR wrapped Fs will hash all non-chunked's)
|
||||
if f.useMD5 && !f.hashFallback && (f.hashAll || f.base.Hashes().Contains(hash.MD5)) {
|
||||
return hash.NewHashSet(hash.MD5)
|
||||
}
|
||||
if f.useSHA1 && !f.quickHash && f.base.Hashes().Contains(hash.SHA1) {
|
||||
if f.useSHA1 && !f.hashFallback && (f.hashAll || f.base.Hashes().Contains(hash.SHA1)) {
|
||||
return hash.NewHashSet(hash.SHA1)
|
||||
}
|
||||
return hash.NewHashSet() // can't provide strong guarantees
|
||||
|
@ -1383,14 +1424,14 @@ func (f *Fs) okForServerSide(ctx context.Context, src fs.Object, opName string)
|
|||
case f.useMD5:
|
||||
md5, _ = obj.Hash(ctx, hash.MD5)
|
||||
ok = md5 != ""
|
||||
if !ok && f.quickHash {
|
||||
if !ok && f.hashFallback {
|
||||
sha1, _ = obj.Hash(ctx, hash.SHA1)
|
||||
ok = sha1 != ""
|
||||
}
|
||||
case f.useSHA1:
|
||||
sha1, _ = obj.Hash(ctx, hash.SHA1)
|
||||
ok = sha1 != ""
|
||||
if !ok && f.quickHash {
|
||||
if !ok && f.hashFallback {
|
||||
md5, _ = obj.Hash(ctx, hash.MD5)
|
||||
ok = md5 != ""
|
||||
}
|
||||
|
@ -1678,17 +1719,14 @@ func (o *Object) SetModTime(ctx context.Context, mtime time.Time) error {
|
|||
|
||||
// Hash returns the selected checksum of the file.
|
||||
// If no checksum is available it returns "".
|
||||
// If a particular hashsum type is not supported, chunker won't fail
|
||||
// with `unsupported` error but return the empty hash string.
|
||||
//
|
||||
// Currently metadata (if not configured as 'none') is kept only for
|
||||
// composite files, but for non-chunked small files chunker obtains
|
||||
// hashsums from wrapped remote.
|
||||
// If a particular hashsum type is not supported, chunker won't fail
|
||||
// with `unsupported` error but return the empty hash string.
|
||||
//
|
||||
// In future metadata logic can be extended: if a normal (non-quick)
|
||||
// hash type is configured, chunker will check whether wrapped remote
|
||||
// supports it (see Fs.Hashes as an example). If not, it will add metadata
|
||||
// to small files as well, thus providing hashsums for all files.
|
||||
// In the "All" mode chunker will force metadata on all files if
|
||||
// particular hashsum type is unsupported by wrapped remote.
|
||||
//
|
||||
func (o *Object) Hash(ctx context.Context, hashType hash.Type) (string, error) {
|
||||
if !o.isComposite() {
|
||||
|
@ -1976,9 +2014,10 @@ type metaSimpleJSON struct {
|
|||
|
||||
// marshalSimpleJSON
|
||||
//
|
||||
// Current implementation creates metadata in two cases:
|
||||
// Current implementation creates metadata in three cases:
|
||||
// - for files larger than chunk size
|
||||
// - if file contents can be mistaken as meta object
|
||||
// - if consistent hashing is on but wrapped remote can't provide given hash
|
||||
//
|
||||
func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 string) ([]byte, error) {
|
||||
version := metadataVersion
|
||||
|
@ -2000,13 +2039,9 @@ func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 s
|
|||
}
|
||||
|
||||
// unmarshalSimpleJSON
|
||||
// Note: only metadata format version 1 is supported atm.
|
||||
//
|
||||
// Current implementation creates metadata only for files larger than
|
||||
// configured chunk size. This approach has drawback: availability of
|
||||
// configured hashsum type for small files depends on the wrapped remote.
|
||||
// Future versions of chunker may change approach as described in comment
|
||||
// to the Hash method. They can transparently migrate older metadata.
|
||||
// Only metadata format version 1 is supported atm.
|
||||
// Future releases will transparently migrate older metadata objects.
|
||||
// New format will have a higher version number and cannot be correctly
|
||||
// handled by current implementation.
|
||||
// The version check below will then explicitly ask user to upgrade rclone.
|
||||
|
|
|
@ -12,6 +12,7 @@ import (
|
|||
"testing"
|
||||
|
||||
"github.com/rclone/rclone/fs"
|
||||
"github.com/rclone/rclone/fs/hash"
|
||||
"github.com/rclone/rclone/fs/operations"
|
||||
"github.com/rclone/rclone/fstest"
|
||||
"github.com/rclone/rclone/fstest/fstests"
|
||||
|
@ -276,6 +277,11 @@ func testSmallFileInternals(t *testing.T, f *Fs) {
|
|||
assert.Nil(t, o.main)
|
||||
assert.True(t, o.isComposite()) // sorry, sometimes a name is misleading
|
||||
assert.Equal(t, 1, len(o.chunks))
|
||||
case f.hashAll:
|
||||
// Consistent hashing forces meta object on small files too
|
||||
assert.NotNil(t, o.main)
|
||||
assert.True(t, o.isComposite())
|
||||
assert.Equal(t, 1, len(o.chunks))
|
||||
default:
|
||||
// normally non-chunked file is kept in the Object's main field
|
||||
assert.NotNil(t, o.main)
|
||||
|
@ -300,6 +306,24 @@ func testSmallFileInternals(t *testing.T, f *Fs) {
|
|||
_ = r.Close()
|
||||
}
|
||||
|
||||
checkHashsum := func(obj fs.Object) {
|
||||
var ht hash.Type
|
||||
switch {
|
||||
case !f.hashAll:
|
||||
return
|
||||
case f.useMD5:
|
||||
ht = hash.MD5
|
||||
case f.useSHA1:
|
||||
ht = hash.SHA1
|
||||
default:
|
||||
return
|
||||
}
|
||||
// even empty files must have hashsum in consistent mode
|
||||
sum, err := obj.Hash(ctx, ht)
|
||||
assert.NoError(t, err)
|
||||
assert.NotEqual(t, sum, "")
|
||||
}
|
||||
|
||||
checkSmallFile := func(name, contents string) {
|
||||
filename := path.Join(dir, name)
|
||||
item := fstest.Item{Path: filename, ModTime: modTime}
|
||||
|
@ -307,6 +331,7 @@ func testSmallFileInternals(t *testing.T, f *Fs) {
|
|||
assert.NotNil(t, put)
|
||||
checkSmallFileInternals(put)
|
||||
checkContents(put, contents)
|
||||
checkHashsum(put)
|
||||
|
||||
// objects returned by Put and NewObject must have similar structure
|
||||
obj, err := f.NewObject(ctx, filename)
|
||||
|
@ -314,6 +339,7 @@ func testSmallFileInternals(t *testing.T, f *Fs) {
|
|||
assert.NotNil(t, obj)
|
||||
checkSmallFileInternals(obj)
|
||||
checkContents(obj, contents)
|
||||
checkHashsum(obj)
|
||||
|
||||
_ = obj.Remove(ctx)
|
||||
_ = put.Remove(ctx) // for good
|
||||
|
|
|
@ -46,20 +46,22 @@ remote> remote:path
|
|||
Files larger than chunk size will be split in chunks.
|
||||
Enter a size with suffix k,M,G,T. Press Enter for the default ("2G").
|
||||
chunk_size> 100M
|
||||
Choose how chunker handles hash sums.
|
||||
Choose how chunker handles hash sums. All modes but "none" require metadata.
|
||||
Enter a string value. Press Enter for the default ("md5").
|
||||
Choose a number from below, or type in your own value
|
||||
/ Chunker can pass any hash supported by wrapped remote
|
||||
1 | for non-chunked files but returns nothing otherwise.
|
||||
1 / Pass any hash supported by wrapped remote for non-chunked files, return nothing otherwise
|
||||
\ "none"
|
||||
2 / MD5 for composite files. Requires "simplejson".
|
||||
2 / MD5 for composite files
|
||||
\ "md5"
|
||||
3 / SHA1 for composite files. Requires "simplejson".
|
||||
3 / SHA1 for composite files
|
||||
\ "sha1"
|
||||
/ Copying a file to chunker will request MD5 from the source
|
||||
4 | falling back to SHA1 if unsupported. Requires "simplejson".
|
||||
4 / MD5 for all files
|
||||
\ "md5all"
|
||||
5 / SHA1 for all files
|
||||
\ "sha1all"
|
||||
6 / Copying a file to chunker will request MD5 from the source falling back to SHA1 if unsupported
|
||||
\ "md5quick"
|
||||
5 / Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
|
||||
7 / Similar to "md5quick" but prefers SHA1 over MD5
|
||||
\ "sha1quick"
|
||||
hash_type> md5
|
||||
Edit advanced config? (y/n)
|
||||
|
@ -190,8 +192,8 @@ Chunker supports hashsums only when a compatible metadata is present.
|
|||
Hence, if you choose metadata format of `none`, chunker will report hashsum
|
||||
as `UNSUPPORTED`.
|
||||
|
||||
Please note that metadata is stored only for composite files. If a file
|
||||
is small (smaller than configured chunk size), chunker will transparently
|
||||
Please note that by default metadata is stored only for composite files.
|
||||
If a file is smaller than configured chunk size, chunker will transparently
|
||||
redirect hash requests to wrapped remote, so support depends on that.
|
||||
You will see the empty string as a hashsum of requested type for small
|
||||
files if the wrapped remote doesn't support it.
|
||||
|
@ -204,6 +206,12 @@ wrapped remote hash for non-chunked ones, we advise you to choose the same
|
|||
hash type as supported by wrapped remote so that your file listings
|
||||
look coherent.
|
||||
|
||||
If your storage backend does not support MD5 or SHA1 but you need consistent
|
||||
file hashing, configure chunker with `md5all` or `sha1all`. These two modes
|
||||
guarantee the given hash for all files. If the wrapped remote doesn't support it,
|
||||
chunker will then add metadata to all files, even small ones. However, this can
|
||||
double the number of small files in storage and incur additional service charges.
|
||||
|
||||
Normally, when a file is copied to chunker controlled remote, chunker
|
||||
will ask the file source for compatible file hash and revert to on-the-fly
|
||||
calculation if none is found. This involves some CPU overhead but provides
|
||||
|
@ -309,7 +317,7 @@ Files larger than chunk size will be split in chunks.
|
|||
|
||||
#### --chunker-hash-type
|
||||
|
||||
Choose how chunker handles hash sums.
|
||||
Choose how chunker handles hash sums. All modes but "none" require metadata.
|
||||
|
||||
- Config: hash_type
|
||||
- Env Var: RCLONE_CHUNKER_HASH_TYPE
|
||||
|
@ -317,17 +325,19 @@ Choose how chunker handles hash sums.
|
|||
- Default: "md5"
|
||||
- Examples:
|
||||
- "none"
|
||||
- Chunker can pass any hash supported by wrapped remote
|
||||
- for non-chunked files but returns nothing otherwise.
|
||||
- Pass any hash supported by wrapped remote for non-chunked files, return nothing otherwise
|
||||
- "md5"
|
||||
- MD5 for composite files. Requires "simplejson".
|
||||
- MD5 for composite files
|
||||
- "sha1"
|
||||
- SHA1 for composite files. Requires "simplejson".
|
||||
- SHA1 for composite files
|
||||
- "md5all"
|
||||
- MD5 for all files
|
||||
- "sha1all"
|
||||
- SHA1 for all files
|
||||
- "md5quick"
|
||||
- Copying a file to chunker will request MD5 from the source
|
||||
- falling back to SHA1 if unsupported. Requires "simplejson".
|
||||
- Copying a file to chunker will request MD5 from the source falling back to SHA1 if unsupported
|
||||
- "sha1quick"
|
||||
- Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
|
||||
- Similar to "md5quick" but prefers SHA1 over MD5
|
||||
|
||||
### Advanced Options
|
||||
|
||||
|
|
Loading…
Reference in a new issue