chunker: finish meta-format before release

changes:
- chunker: remove GetTier and SetTier
- remove wdmrcompat metaformat
- remove fastopen strategy
- make hash_type option non-advanced
- advertise hash support when possible
- add metadata field "ver", run strict checks
- describe internal behavior in comments
- improve documentation

note:
wdmrcompat used to write file name in the metadata, so maximum metadata
size was 1K; removing it allows capping the size at 200 bytes now.
This commit is contained in:
Ivan Andreev 2019-09-25 02:18:30 +03:00 committed by Nick Craig-Wood
parent c41812fc88
commit ccecfa9cb1
5 changed files with 303 additions and 312 deletions

View file

@ -36,13 +36,11 @@ const (
// WARNING: this optimization is not transaction safe!
optimizeFirstChunk = false
// Normally metadata is a small (less than 1KB) piece of JSON.
// Normally metadata is a small (100-200 bytes) piece of JSON.
// Valid metadata size should not exceed this limit.
maxMetaDataSize = 1023
maxMetaDataSize = 199
// fastopen strategy opens all chunks immediately, but reads sequentially.
// linear strategy opens and reads chunks sequentially, without read-ahead.
downloadStrategy = "linear"
metaDataVersion = 1
)
// Formatting of temporary chunk names. Temporary suffix *follows* chunk
@ -52,6 +50,13 @@ var (
tempChunkRegexp = regexp.MustCompile(`^(.+)\.\.tmp_([0-9]{10,19})$`)
)
// Note: metadata logic is tightly coupled with chunker code in many
// places of the code, eg. in checks whether a file can have meta object
// or is eligible for chunking.
// If more metadata formats (or versions of a format) are added in future,
// it may be advisable to factor it into a "metadata strategy" interface
// similar to chunkingReader or linearReader below.
// Register with Fs
func init() {
fs.Register(&fs.RegInfo{
@ -98,16 +103,10 @@ Metadata is a small JSON file named after the composite file.`,
Value: "simplejson",
Help: `Simple JSON supports hash sums and chunk validation.
It has the following fields: size, nchunks, md5, sha1.`,
}, {
Value: "wdmrcompat",
Help: `This format brings compatibility with WebDavMailRuCloud.
It does not support hash sums or validation, most fields are ignored.
It has the following fields: Name, Size, PublicKey, CreationDate.
Requires hash type "none".`,
}},
}, {
Name: "hash_type",
Advanced: true,
Advanced: false,
Default: "md5",
Help: `Choose how chunker handles hash sums.`,
Examples: []fs.OptionExample{{
@ -122,8 +121,8 @@ for a single-chunk file but returns nothing otherwise.`,
Help: `SHA1 for multi-chunk files. Requires "simplejson".`,
}, {
Value: "md5quick",
Help: `When a file is copied on to chunker, MD5 is taken from its source
falling back to SHA1 if the source doesn't support it. Requires "simplejson".`,
Help: `Copying a file to chunker will request MD5 from the source
falling back to SHA1 if unsupported. Requires "simplejson".`,
}, {
Value: "sha1quick",
Help: `Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".`,
@ -188,7 +187,7 @@ func NewFs(name, rpath string, m configmap.Mapper) (fs.Fs, error) {
switch opt.MetaFormat {
case "none":
f.useMeta = false
case "simplejson", "wdmrcompat":
case "simplejson":
f.useMeta = true
default:
return nil, fmt.Errorf("unsupported meta format '%s'", opt.MetaFormat)
@ -243,8 +242,6 @@ func NewFs(name, rpath string, m configmap.Mapper) (fs.Fs, error) {
WriteMimeType: true,
BucketBased: true,
CanHaveEmptyDirectories: true,
SetTier: true,
GetTier: true,
ServerSideAcrossConfigs: true,
}).Fill(f).Mask(baseFs).WrapsFs(f, baseFs)
@ -393,6 +390,19 @@ func (f *Fs) parseChunkName(name string) (mainName string, chunkNo int, tempNo i
//
// This should return ErrDirNotFound if the directory isn't
// found.
//
// Commands normally cleanup all temporary chunks in case of a failure.
// However, if rclone dies unexpectedly, it can leave behind a bunch of
// hidden temporary chunks. List and its underlying chunkEntries()
// silently skip all temporary chunks in the directory. It's okay if
// they belong to an unfinished command running in parallel.
//
// However, there is no way to discover dead temporary chunks a.t.m.
// As a workaround users can use `purge` to forcibly remove the whole
// directory together with dead chunks.
// In future a flag named like `--chunker-list-hidden` may be added to
// rclone that will tell List to reveal hidden chunks.
//
func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
entries, err = f.base.List(ctx, dir)
if err != nil {
@ -428,7 +438,8 @@ func (f *Fs) ListR(ctx context.Context, dir string, callback fs.ListRCallback) (
})
}
// Add some directory entries. This alters entries returning it as newEntries.
// chunkEntries is called by List(R). It merges chunk entries from
// wrapped remote into composite directory entries.
func (f *Fs) chunkEntries(ctx context.Context, origEntries fs.DirEntries, hardErrors bool) (chunkedEntries fs.DirEntries, err error) {
// sort entries, so that meta objects (if any) appear before their chunks
sortedEntries := make(fs.DirEntries, len(origEntries))
@ -514,6 +525,11 @@ func (f *Fs) chunkEntries(ctx context.Context, origEntries fs.DirEntries, hardEr
}
// NewObject finds the Object at remote.
//
// Please note that every NewObject invocation will scan the whole directory.
// Using here something like fs.DirCache might improve performance (and make
// logic more complex though).
//
func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) {
if mainRemote, _, _ := f.parseChunkName(remote); mainRemote != "" {
return nil, fmt.Errorf("%q should be meta object, not a chunk", remote)
@ -622,23 +638,14 @@ func (o *Object) readMetaData(ctx context.Context) error {
case "simplejson":
metaInfo, err := unmarshalSimpleJSON(ctx, metaObject, metaData)
if err != nil {
// TODO: maybe it's a small single chunk?
return err
// TODO: in a rare case we might mistake a small file for metadata
return errors.Wrap(err, "invalid metadata")
}
if o.size != metaInfo.Size() || len(o.chunks) != metaInfo.nChunks {
return errors.New("invalid simplejson metadata")
return errors.New("metadata doesn't match file size")
}
o.md5 = metaInfo.md5
o.sha1 = metaInfo.sha1
case "wdmrcompat":
metaInfo, err := unmarshalWDMRCompat(ctx, metaObject, metaData)
if err != nil {
// TODO: maybe it's a small single chunk?
return err
}
if o.size != metaInfo.Size() {
return errors.New("invalid wdmrcompat metadata")
}
}
o.isFull = true
@ -784,9 +791,6 @@ func (f *Fs) put(ctx context.Context, in io.Reader, src fs.ObjectInfo, remote st
case "simplejson":
c.updateHashes()
metaData, err = marshalSimpleJSON(ctx, sizeTotal, len(c.chunks), c.md5, c.sha1)
case "wdmrcompat":
fileInfo := f.wrapInfo(src, baseRemote, sizeTotal)
metaData, err = marshalWDMRCompat(ctx, fileInfo)
}
if err == nil {
metaInfo := f.wrapInfo(src, baseRemote, int64(len(metaData)))
@ -951,6 +955,9 @@ func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, opt
// Update in to the object with the modTime given of the given size
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
if err := o.readMetaData(ctx); err != nil {
return err
}
basePut := o.f.base.Put
if src.Size() < 0 {
basePut = o.f.base.Features().PutStream
@ -989,8 +996,17 @@ func (f *Fs) Precision() time.Duration {
}
// Hashes returns the supported hash sets.
// Chunker advertises a hash type if and only if it can be calculated
// for files of any size, multi-chunked or small.
func (f *Fs) Hashes() hash.Set {
return hash.Set(hash.None)
// composites && all of them && small files supported by wrapped remote
if f.useMD5 && !f.quickHash && f.base.Hashes().Contains(hash.MD5) {
return hash.NewHashSet(hash.MD5)
}
if f.useSHA1 && !f.quickHash && f.base.Hashes().Contains(hash.SHA1) {
return hash.NewHashSet(hash.SHA1)
}
return hash.NewHashSet() // can't provide strong guarantees
}
// Mkdir makes the directory (container, bucket)
@ -1012,7 +1028,12 @@ func (f *Fs) Rmdir(ctx context.Context, dir string) error {
// Implement this if you have a way of deleting all the files
// quicker than just running Remove() on the result of List()
//
// Return an error if it doesn't exist
// Return an error if it doesn't exist.
//
// This command will chain to `purge` from wrapped remote.
// As a result it removes not only chunker files with their
// active chunks but also all hidden chunks in the directory.
//
func (f *Fs) Purge(ctx context.Context) error {
do := f.base.Features().Purge
if do == nil {
@ -1021,7 +1042,25 @@ func (f *Fs) Purge(ctx context.Context) error {
return do(ctx)
}
// Remove an object
// Remove an object (chunks and metadata, if any)
//
// Remove deletes only active chunks of the object.
// It does not try to look for temporary chunks because they could belong
// to another command modifying this composite file in parallel.
//
// Commands normally cleanup all temporary chunks in case of a failure.
// However, if rclone dies unexpectedly, it can leave hidden temporary
// chunks, which cannot be discovered using the `list` command.
// Remove does not try to search for such chunks or delete them.
// Sometimes this can lead to strange results eg. when `list` shows that
// directory is empty but `rmdir` refuses to remove it because on the
// level of wrapped remote it's actually *not* empty.
// As a workaround users can use `purge` to forcibly remove it.
//
// In future, a flag `--chunker-delete-hidden` may be added which tells
// Remove to search directory for hidden chunks and remove them too
// (at the risk of breaking parallel commands).
//
func (o *Object) Remove(ctx context.Context) (err error) {
if o.main != nil {
err = o.main.Remove(ctx)
@ -1095,13 +1134,6 @@ func (f *Fs) copyOrMove(ctx context.Context, o *Object, remote string, do copyMo
metaInfo := f.wrapInfo(metaObject, "", int64(len(metaData)))
err = newObj.main.Update(ctx, bytes.NewReader(metaData), metaInfo)
}
case "wdmrcompat":
newInfo := f.wrapInfo(metaObject, "", newObj.size)
metaData, err = marshalWDMRCompat(ctx, newInfo)
if err == nil {
metaInfo := f.wrapInfo(metaObject, "", int64(len(metaData)))
err = newObj.main.Update(ctx, bytes.NewReader(metaData), metaInfo)
}
case "none":
if newObj.main != nil {
err = newObj.main.Remove(ctx)
@ -1436,7 +1468,22 @@ func (o *Object) SetModTime(ctx context.Context, mtime time.Time) error {
// Hash returns the selected checksum of the file.
// If no checksum is available it returns "".
// It prefers the wrapped hashsum for a non-chunked file, then tries saved one.
//
// Hash prefers wrapped hashsum for a non-chunked file, then tries to
// read it from metadata. This in theory handles an unusual case when
// a small file is modified on the lower level by wrapped remote
// but chunker is not yet aware of changes.
//
// Currently metadata (if not configured as 'none') is kept only for
// multi-chunk files, but for small files chunker obtains hashsums from
// wrapped remote. If a particular hashsum type is not supported,
// chunker won't fail with `unsupported` error but return empty hash.
//
// In future metadata logic can be extended: if a normal (non-quick)
// hash type is configured, chunker will check whether wrapped remote
// supports it (see Fs.Hashes as an example). If not, it will add metadata
// to small files as well, thus providing hashsums for all files.
//
func (o *Object) Hash(ctx context.Context, hashType hash.Type) (string, error) {
if !o.isChunked() {
// First, chain to the single wrapped chunk, if possible.
@ -1500,78 +1547,10 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (rc io.Read
limit = o.size - offset
}
switch downloadStrategy {
case "linear":
return o.newLinearReader(ctx, offset, limit, openOptions)
case "fastopen":
return o.newFastopenReader(ctx, offset, limit, openOptions)
default:
return nil, errors.New("invalid download strategy")
}
}
// fastopenReader opens all chunks immediately, but reads sequentially.
// It is the io.ReadCloser returned by newFastopenReader.
type fastopenReader struct {
// readClosers holds every opened chunk stream; Close walks this list.
readClosers []io.ReadCloser
// multiReader is the io.MultiReader built over the same chunk streams;
// Read delegates to it.
multiReader io.Reader
}
// newFastopenReader opens, up front, every chunk that overlaps the byte
// window [offset, offset+limit) of the composite file and returns a reader
// that yields those chunk ranges back to back.
//
// Chunks lying entirely before offset are skipped without being opened.
// Each remaining chunk is opened with the caller's options plus an
// fs.RangeOption restricted to the part of the chunk inside the window.
// If opening any chunk fails, the chunks opened so far are closed (their
// close errors are ignored) and the open error is returned.
func (o *Object) newFastopenReader(ctx context.Context, offset, limit int64, options []fs.OpenOption) (io.ReadCloser, error) {
	var opened []io.ReadCloser
	var streams []io.Reader

	// skip is the number of bytes still to be discarded before the window
	// starts; remaining is the number of bytes still to be delivered.
	skip, remaining := offset, limit
	for _, chunk := range o.chunks {
		if remaining <= 0 {
			break
		}
		chunkSize := chunk.Size()
		if skip >= chunkSize {
			// The window starts after this chunk.
			skip -= chunkSize
			continue
		}
		want := chunkSize - skip
		if remaining < want {
			want = remaining
		}
		rangeOpt := &fs.RangeOption{Start: skip, End: skip + want - 1}
		stream, err := chunk.Open(ctx, append(options, rangeOpt)...)
		if err != nil {
			partial := fastopenReader{readClosers: opened}
			_ = partial.Close() // ignore error
			return nil, err
		}
		opened = append(opened, stream)
		streams = append(streams, stream)
		// Only the first opened chunk starts at a non-zero position.
		skip = 0
		remaining -= want
	}
	return &fastopenReader{
		readClosers: opened,
		multiReader: io.MultiReader(streams...),
	}, nil
}
// Read implements io.Reader by delegating to the multiReader field and
// returning its byte count and error unchanged.
func (r *fastopenReader) Read(p []byte) (n int, err error) {
return r.multiReader.Read(p)
}
// Close closes every chunk stream held in readClosers. All streams are
// closed even when some of them fail; the first non-nil error is returned.
func (r *fastopenReader) Close() (err error) {
	for i := range r.readClosers {
		if closeErr := r.readClosers[i].Close(); closeErr != nil && err == nil {
			err = closeErr
		}
	}
	return err
}
// linearReader opens and reads chunks sequentially, without read-ahead
// linearReader opens and reads file chunks sequentially, without read-ahead
type linearReader struct {
ctx context.Context
chunks []fs.Object
@ -1771,25 +1750,9 @@ func (o *Object) ID() string {
return ""
}
// SetTier changes the storage tier (class) of the Object by forwarding the
// request to its main chunk. It returns an error when the main chunk does
// not implement fs.SetTierer.
func (o *Object) SetTier(tier string) error {
	doer, ok := o.mainChunk().(fs.SetTierer)
	if !ok {
		return errors.New("chunker: wrapped remote does not support SetTier")
	}
	return doer.SetTier(tier)
}
// GetTier reports the storage tier (class) of the Object as returned by
// its main chunk, or "" when the main chunk does not implement fs.GetTierer.
func (o *Object) GetTier() string {
	doer, ok := o.mainChunk().(fs.GetTierer)
	if !ok {
		return ""
	}
	return doer.GetTier()
}
// Meta format `simplejson`
type metaSimpleJSON struct {
Version int `json:"ver"`
Size int64 `json:"size"`
NChunks int `json:"nchunks"`
MD5 string `json:"md5"`
@ -1798,6 +1761,7 @@ type metaSimpleJSON struct {
func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 string) (data []byte, err error) {
metaData := &metaSimpleJSON{
Version: metaDataVersion,
Size: size,
NChunks: nChunks,
MD5: md5,
@ -1806,47 +1770,56 @@ func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 s
return json.Marshal(&metaData)
}
// Note: only metadata format version 1 is supported a.t.m.
//
// Current implementation creates metadata only for files larger than
// configured chunk size. This approach has drawback: availability of
// configured hashsum type for small files depends on the wrapped remote.
// Future versions of chunker may change approach as described in comment
// to the Hash method. They can transparently migrate older metadata.
// New format will have a higher version number and cannot be correctly
// handled by current implementation.
// The version check below will then explicitly ask user to upgrade rclone.
//
func unmarshalSimpleJSON(ctx context.Context, metaObject fs.Object, data []byte) (info *ObjectInfo, err error) {
var metaData *metaSimpleJSON
err = json.Unmarshal(data, &metaData)
if err != nil {
return
return nil, err
}
// Perform strict checks, avoid corruption of future metadata formats.
if metaData.Size < 0 {
return nil, errors.New("negative file size")
}
if metaData.NChunks <= 0 {
return nil, errors.New("wrong number of chunks")
}
if metaData.MD5 != "" {
_, err = hex.DecodeString(metaData.MD5)
if len(metaData.MD5) != 32 || err != nil {
return nil, errors.New("wrong md5 hash")
}
}
if metaData.SHA1 != "" {
_, err = hex.DecodeString(metaData.SHA1)
if len(metaData.SHA1) != 40 || err != nil {
return nil, errors.New("wrong sha1 hash")
}
}
if metaData.Version <= 0 {
return nil, errors.New("wrong version number")
}
if metaData.Version != metaDataVersion {
return nil, errors.Errorf("version %d is not supported, please upgrade rclone", metaData.Version)
}
var nilFs *Fs // nil object triggers appropriate type method
info = nilFs.wrapInfo(metaObject, "", metaData.Size)
info.md5 = metaData.MD5
info.sha1 = metaData.SHA1
info.nChunks = metaData.NChunks
return
}
// Meta format `wdmrcompat`
//
// metaWDMRCompat is the JSON layout of a `wdmrcompat` meta object.
// marshalWDMRCompat fills every field when writing; unmarshalWDMRCompat
// uses only Size when reading.
type metaWDMRCompat struct {
// Name is written as the base name of the source's remote path.
Name string `json:"Name"`
// Size is written as the size reported by the source object info.
Size int64 `json:"Size"`
PublicKey interface{} `json:"PublicKey"` // ignored, can be nil
CreationDate time.Time `json:"CreationDate"` // modification time, ignored
}
// marshalWDMRCompat serializes srcInfo into `wdmrcompat` JSON metadata.
// Name is the base name of srcInfo's remote path, Size is its size,
// PublicKey is always null and CreationDate carries the modification time
// converted to UTC.
func marshalWDMRCompat(ctx context.Context, srcInfo fs.ObjectInfo) (data []byte, err error) {
	modTime := srcInfo.ModTime(ctx).UTC()
	baseName := path.Base(srcInfo.Remote())
	meta := &metaWDMRCompat{
		Name:         baseName,
		Size:         srcInfo.Size(),
		PublicKey:    nil,
		CreationDate: modTime,
	}
	return json.Marshal(&meta)
}
func unmarshalWDMRCompat(ctx context.Context, metaObject fs.Object, data []byte) (info *ObjectInfo, err error) {
var metaData *metaWDMRCompat
err = json.Unmarshal(data, &metaData)
if err != nil {
return
}
var nilFs *Fs // nil object triggers appropriate type method
info = nilFs.wrapInfo(metaObject, "", metaData.Size)
return
return info, nil
}
// Check the interfaces are satisfied
@ -1868,6 +1841,4 @@ var (
_ fs.Object = (*Object)(nil)
_ fs.ObjectUnWrapper = (*Object)(nil)
_ fs.IDer = (*Object)(nil)
_ fs.SetTierer = (*Object)(nil)
_ fs.GetTierer = (*Object)(nil)
)

View file

@ -31,7 +31,11 @@ func TestIntegration(t *testing.T) {
RemoteName: *fstest.RemoteName,
NilObject: (*chunker.Object)(nil),
SkipBadWindowsCharacters: !*UseBadChars,
UnimplementableObjectMethods: []string{"MimeType"},
UnimplementableObjectMethods: []string{
"MimeType",
"GetTier",
"SetTier",
},
UnimplementableFsMethods: []string{
"PublicLink",
"OpenWriterAt",

View file

@ -4,11 +4,11 @@ description: "Split-chunking overlay remote"
date: "2019-08-30"
---
<i class="fa fa-cut"></i>Chunker
<i class="fa fa-cut"></i>Chunker (BETA)
----------------------------------------
The `chunker` overlay transparently splits large files into smaller chunks
during the upload to wrapped remote and transparently assembles them back
during upload to wrapped remote and transparently assembles them back
when the file is downloaded. This allows users to effectively overcome size limits
imposed by storage providers.
@ -41,10 +41,27 @@ Storage> chunker
Remote to chunk/unchunk.
Normally should contain a ':' and a path, eg "myremote:path/to/dir",
"myremote:bucket" or maybe "myremote:" (not recommended).
Enter a string value. Press Enter for the default ("").
remote> remote:path
Files larger than chunk_size will be split in chunks. By default 2 Gb.
Files larger than chunk size will be split in chunks.
Enter a size with suffix k,M,G,T. Press Enter for the default ("2G").
chunk_size> 1G
chunk_size> 100M
Choose how chunker handles hash sums.
Enter a string value. Press Enter for the default ("md5").
Choose a number from below, or type in your own value
/ Chunker can pass any hash supported by wrapped remote
1 | for a single-chunk file but returns nothing otherwise.
\ "none"
2 / MD5 for multi-chunk files. Requires "simplejson".
\ "md5"
3 / SHA1 for multi-chunk files. Requires "simplejson".
\ "sha1"
/ Copying a file to chunker will request MD5 from the source
4 | falling back to SHA1 if unsupported. Requires "simplejson".
\ "md5quick"
5 / Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
\ "sha1quick"
hash_type> md5
Edit advanced config? (y/n)
y) Yes
n) No
@ -53,8 +70,9 @@ Remote config
--------------------
[overlay]
type = chunker
remote = TestLocal:
chunk_size = 2G
remote = remote:bucket
chunk_size = 100M
hash_type = md5
--------------------
y) Yes this is OK
e) Edit this remote
@ -73,8 +91,8 @@ will put files in a directory called `name` in the current directory.
### Chunking
When rclone starts a file upload, chunker checks the file size.
If it doesn't exceed the configured chunk size, chunker will just pass it
When rclone starts a file upload, chunker checks the file size. If it
doesn't exceed the configured chunk size, chunker will just pass the file
to the wrapped remote. If a file is large, chunker will transparently cut
data in pieces with temporary names and stream them one by one, on the fly.
Each chunk will contain the specified number of data bytes, except for the
@ -84,7 +102,7 @@ a temporary copy, record its size and repeat the above process.
When upload completes, temporary chunk files are finally renamed.
This scheme guarantees that operations look from outside as atomic.
A similar method with hidden temporary chunks is used for other operations
(copy/move/rename etc). If operation fails, hidden chunks are normally
(copy/move/rename etc). If an operation fails, hidden chunks are normally
destroyed, and the destination composite file stays intact.
#### Chunk names
@ -94,58 +112,52 @@ By default chunk names are `BIG_FILE_NAME.rclone-chunk.001`,
format is `*.rclone-chunk.###`. You can configure another name format
using the `--chunker-name-format` option. The format uses asterisk
`*` as a placeholder for the base file name and one or more consecutive
hash characters `#` as a placeholder for the chunk number. There must be
one and only one asterisk. The number of consecutive hashes defines the
minimum length of a string representing a chunk number. If a chunk number
has less digits than the number of hashes, it is left-padded by zeros.
If there are more digits in the number, they are left as is.
hash characters `#` as a placeholder for sequential chunk number.
There must be one and only one asterisk. The number of consecutive hash
characters defines the minimum length of a string representing a chunk number.
If decimal chunk number has fewer digits than the number of hashes, it is
left-padded by zeros. If the number string is longer, it is left intact.
By default numbering starts from 1 but there is another option that allows
user to start from 0, eg. for compatibility with legacy software.
For example, if name format is `big_*-##.part`, and original file was
named `data.txt` and numbering starts from 0, then the first chunk will be
named `big_data.txt-00.part`, the 99th chunk will be `big_data.txt-98.part`
and the 302nd chunk will be `big_data.txt-301.part`.
For example, if name format is `big_*-##.part` and original file name is
`data.txt` and numbering starts from 0, then the first chunk will be named
`big_data.txt-00.part`, the 99th chunk will be `big_data.txt-98.part`
and the 302nd chunk will become `big_data.txt-301.part`.
Would-be chunk files are ignored if their name does not match given format.
The list command might encounter composite files with missing or invalid
chunks. By default, if chunker detects a missing chunk it will silently
ignore the whole group. Use the `--chunker-fail-on-bad-chunks` flag
to make it fail with an error message.
When the `list` rclone command scans a directory on wrapped remote, the
potential chunk files are accounted for and merged into composite directory
entries only if their names match the configured format. All other files
are ignored, including temporary chunks.
The list command might encounter composite files with missing or invalid
chunks. If chunker detects a missing chunk it will by default silently
ignore the whole group. You can use the `--chunker-fail-on-bad-chunks`
command line flag to make `list` fail with an error message.
### Metadata
By default when a file is large enough, chunker will create a metadata
object besides data chunks. The object is named after the original file.
Chunker allows to choose between few metadata formats. Please note that
currently metadata is not created for files smaller than configured
chunk size. This may change in future as new formats are developed.
Chunker allows user to disable metadata completely (the `none` format).
Please note that currently metadata is not created for files smaller
than configured chunk size. This may change in future as new formats
are developed.
#### Simple JSON metadata format
This is the default format. It supports hash sums and chunk validation
for composite files. Meta objects carry the following fields:
- `size` - total size of chunks
- `nchunks` - number of chunks
- `md5` - MD5 hashsum (if present)
- `ver` - version of format, currently `1`
- `size` - total size of composite file
- `nchunks` - number of chunks in the file
- `md5` - MD5 hashsum of composite file (if present)
- `sha1` - SHA1 hashsum (if present)
There is no field for composite file name as it's simply equal to the name
of meta object on the wrapped remote. Please refer to respective sections
for details on hashsums and modified time handling.
#### WebDavMailRu compatible metadata format
The `wdmrcompat` metadata format is only useful to support historical files
created by [WebDriveMailru](https://github.com/yar229/WebDavMailRuCloud).
It keeps the following fields (most are ignored, though):
- `Name` - name of the composite file (always equal to the meta file name)
- `Size` - total size of chunks
- `PublicKey` - ignored, always "null"
- `CreationDate` - last modification (sic!) time, ignored.
for details on hashsums and handling of modified time.
#### No metadata
@ -161,8 +173,8 @@ errors (especially missing last chunk) than metadata-enabled formats.
### Hashsums
Chunker supports hashsums only when a compatible metadata is present.
Thus, if you choose metadata format of `none` or `wdmrcompat`, chunker
will return `UNSUPPORTED` as hashsum.
Thus, if you choose metadata format of `none`, chunker will return
`UNSUPPORTED` as hashsum.
Please note that metadata is stored only for composite files. If a file
is small (smaller than configured chunk size), chunker will transparently
@ -175,16 +187,16 @@ Currently you can choose one or another but not both.
MD5 is set by default as the most supported type.
Since chunker keeps hashes for composite files and falls back to the
wrapped remote hash for small ones, we advise you to choose the same
hash type as wrapped remote, so your file listings look coherent.
hash type as wrapped remote so that your file listings look coherent.
Normally, when a file is copied to chunker controlled remote, chunker
will ask its source for compatible file hash and revert to on-the-fly
Normally, when a file is copied to a chunker controlled remote, chunker
will ask the file source for compatible file hash and revert to on-the-fly
calculation if none is found. This involves some CPU overhead but provides
a guarantee that given hashsum is available. Also, chunker will reject
a server-side copy or move operation if source and destination hashsum
types are different, resulting in the extra network bandwidth, too.
In some rare cases this may be undesired, so chunker provides two optional
choices: `sha1quick` and `md5quick`. If source does not have the primary
choices: `sha1quick` and `md5quick`. If the source does not support primary
hash type and the quick mode is enabled, chunker will try to fall back to
the secondary type. This will save CPU and bandwidth but can result in empty
hashsums at destination. Beware of consequences: the `sync` command will
@ -215,13 +227,14 @@ chunk naming scheme is to:
hash type, chunk naming etc.
- Now run `rclone sync oldchunks: newchunks:` and all your data
will be transparently converted at transfer.
This may take some time.
This may take some time, yet chunker will try server-side
copy if possible.
- After checking data integrity you may remove configuration section
of the old remote.
If rclone gets killed during a long operation on a big composite file,
hidden temporary chunks may stay in the directory. They will not be
shown by the list command but will eat up your account quota.
shown by the `list` command but will eat up your account quota.
Please note that the `deletefile` rclone command deletes only active
chunks of a file. As a workaround, you can use remote of the wrapped
file system to see them.
@ -234,17 +247,18 @@ remove everything including garbage.
### Caveats and Limitations
Chunker requires wrapped remote to support server side `move` (or `copy` +
delete) operations, otherwise it will explicitly refuse to start.
`delete`) operations, otherwise it will explicitly refuse to start.
This is because it internally renames temporary chunk files to their final
names when an operation completes successfully.
Note that moves done using the copy-and-delete method may incur double
charging with some cloud storage providers.
Note that a move implemented using the copy-and-delete method may incur
double charging with some cloud storage providers.
Chunker will not automatically rename existing chunks when you change the
chunk name format. Beware that in result of this some files which have been
treated as chunks before the change can pop up in directory listings as
normal files and vice versa. The same warning holds for the chunk size.
Chunker will not automatically rename existing chunks when you run
`rclone config` on a live remote and change the chunk name format.
Beware that as a result of this some files which have been treated as chunks
before the change can pop up in directory listings as normal files
and vice versa. The same warning holds for the chunk size.
If you desperately need to change critical chunking settings, you should
run data migration as described in a dedicated section.
@ -278,6 +292,28 @@ Files larger than chunk size will be split in chunks.
- Type: SizeSuffix
- Default: 2G
#### --chunker-hash-type
Choose how chunker handles hash sums.
- Config: hash_type
- Env Var: RCLONE_CHUNKER_HASH_TYPE
- Type: string
- Default: "md5"
- Examples:
- "none"
- Chunker can pass any hash supported by wrapped remote
- for a single-chunk file but returns nothing otherwise.
- "md5"
- MD5 for multi-chunk files. Requires "simplejson".
- "sha1"
- SHA1 for multi-chunk files. Requires "simplejson".
- "md5quick"
- Copying a file to chunker will request MD5 from the source
- falling back to SHA1 if unsupported. Requires "simplejson".
- "sha1quick"
- Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
### Advanced Options
Here are the advanced options specific to chunker (Transparently chunk/split large files).
@ -321,33 +357,6 @@ Metadata is a small JSON file named after the composite file.
- "simplejson"
- Simple JSON supports hash sums and chunk validation.
- It has the following fields: size, nchunks, md5, sha1.
- "wdmrcompat"
- This format brings compatibility with WebDavMailRuCloud.
- It does not support hash sums or validation, most fields are ignored.
- It has the following fields: Name, Size, PublicKey, CreationDate.
- Requires hash type "none".
#### --chunker-hash-type
Choose how chunker handles hash sums.
- Config: hash_type
- Env Var: RCLONE_CHUNKER_HASH_TYPE
- Type: string
- Default: "md5"
- Examples:
- "none"
- Chunker can pass any hash supported by wrapped remote
- for a single-chunk file but returns nothing otherwise.
- "md5"
- MD5 for multi-chunk files. Requires "simplejson".
- "sha1"
- SHA1 for multi-chunk files. Requires "simplejson".
- "md5quick"
- When a file is copied on to chunker, MD5 is taken from its source
- falling back to SHA1 if the source doesn't support it. Requires "simplejson".
- "sha1quick"
- Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
#### --chunker-fail-on-bad-chunks

View file

@ -986,7 +986,6 @@ func TestSyncWithTrackRenames(t *testing.T) {
fs.Config.TrackRenames = true
defer func() {
fs.Config.TrackRenames = false
}()
haveHash := r.Fremote.Hashes().Overlap(r.Flocal.Hashes()).GetOne() != hash.None
@ -1010,45 +1009,64 @@ func TestSyncWithTrackRenames(t *testing.T) {
fstest.CheckItems(t, r.Fremote, f1, f2)
if canTrackRenames {
if r.Fremote.Features().Move == nil || r.Fremote.Name() == "TestUnion" { // union remote can Move but returns CantMove error
// If no server side Move, we are falling back to Copy + Delete
assert.Equal(t, int64(1), accounting.GlobalStats().GetTransfers()) // 1 copy
assert.Equal(t, int64(4), accounting.GlobalStats().GetChecks()) // 2 file checks + 1 move + 1 delete
} else {
assert.Equal(t, int64(0), accounting.GlobalStats().GetTransfers()) // 0 copy
assert.Equal(t, int64(3), accounting.GlobalStats().GetChecks()) // 2 file checks + 1 move
}
} else {
if toyFileChecks(r) != -1 {
assert.Equal(t, toyFileChecks(r), accounting.GlobalStats().GetChecks())
}
assert.Equal(t, toyFileTransfers(r), accounting.GlobalStats().GetTransfers())
}
}
func toyFileChecks(r *fstest.Run) int64 {
// Since there is currently no Fs interface that reports the number of chunks
// in a file, this test depends on the well-known names of test remotes.
remote := r.Fremote.Name()
// Numbers below are calculated for a 14 byte file.
if !strings.HasPrefix(remote, "TestChunker") {
return 2
}
// Chunker makes more internal checks.
// Union remote can Move but returns CantMove error.
moveAsCopyDelete := r.Fremote.Features().Move == nil || remote == "TestUnion"
chunker := strings.HasPrefix(remote, "TestChunker")
wrappedMoveAsCopyDelete := chunker && strings.HasSuffix(remote, "S3")
chunk3b := chunker && strings.Contains(remote, "Chunk3b") // chunker with 3 byte chunks
chunk50b := chunker && strings.Contains(remote, "Chunk50b") // chunker with 50 byte chunks
chunkDefault := chunker && !strings.Contains(remote, "ChunkerChunk") // default big chunk size
chunkBig := chunk50b || chunkDefault // file is smaller than chunk size
// Verify number of checks for a toy 14 byte file.
// The order of cases matters!
var checks int
switch {
case strings.Contains(remote, "Chunk3b"): // chunk 3 bytes
checks = 6
case strings.Contains(remote, "Chunk50b"): // chunk 50 bytes
checks = 3
case strings.Contains(remote, "ChunkerChunk"): // unknown chunk size
return -1
case canTrackRenames && chunk3b:
checks = 8 // chunker makes extra checks for each small chunk
case canTrackRenames && chunkBig:
checks = 4 // chunker makes 1 extra check for a single big chunk
case canTrackRenames && moveAsCopyDelete:
checks = 4 // 2 file checks + 1 move + 1 delete
case canTrackRenames:
checks = 3 // 2 file checks + 1 move
case !chunker:
checks = 2 // 2 file checks on a generic non-chunking remote
case chunk3b:
checks = 6 // chunker makes extra checks for each small chunk
case chunkBig && wrappedMoveAsCopyDelete:
checks = 4 // one more extra check because S3 emulates Move as Copy+Delete
case chunkBig:
checks = 3 // chunker makes 1 extra check for a single big chunk
default:
checks = 3 // large chunks (eventually no chunking)
checks = -1 // skip verification for chunker with unknown chunk size
}
if strings.HasSuffix(remote, "S3") {
checks++ // Extra check because S3 emulates Move as Copy+Delete.
if checks != -1 { // "-1" allows remotes to bypass this check
assert.Equal(t, int64(checks), accounting.GlobalStats().GetChecks())
}
// Verify number of copy operations for a toy 14 byte file.
// The order of cases matters!
var copies int64
switch {
case canTrackRenames && moveAsCopyDelete:
copies = 1 // 1 copy
case canTrackRenames:
copies = 0 // 0 copy
case chunkBig && wrappedMoveAsCopyDelete:
copies = 2 // extra Copy because S3 emulates Move as Copy+Delete.
default:
copies = 1
}
if copies != -1 { // "-1" allows remotes to bypass this check
assert.Equal(t, copies, accounting.GlobalStats().GetTransfers())
}
return int64(checks)
}
func toyFileTransfers(r *fstest.Run) int64 {

View file

@ -33,9 +33,6 @@ backends:
- backend: "chunker"
remote: "TestChunkerNometaLocal:"
fastlist: true
- backend: "chunker"
remote: "TestChunkerCompatLocal:"
fastlist: true
- backend: "chunker"
remote: "TestChunkerChunk3bLocal:"
fastlist: true
@ -44,10 +41,6 @@ backends:
remote: "TestChunkerChunk3bNometaLocal:"
fastlist: true
maxfile: 6k
- backend: "chunker"
remote: "TestChunkerChunk3bCompatLocal:"
fastlist: true
maxfile: 6k
- backend: "chunker"
remote: "TestChunkerMailru:"
fastlist: true
@ -66,30 +59,26 @@ backends:
- backend: "chunker"
remote: "TestChunkerS3:"
fastlist: true
ignore:
- TestIntegration/FsMkdir/FsPutFiles/SetTier
- backend: "chunker"
remote: "TestChunkerChunk50bS3:"
fastlist: true
maxfile: 1k
ignore:
- TestIntegration/FsMkdir/FsPutFiles/SetTier
#- backend: "chunker"
# remote: "TestChunkerChunk50bMD5HashS3:"
# fastlist: true
# maxfile: 1k
#- backend: "chunker"
# remote: "TestChunkerChunk50bMD5QuickS3:"
# fastlist: true
# maxfile: 1k
#- backend: "chunker"
# remote: "TestChunkerChunk50bSHA1HashS3:"
# fastlist: true
# maxfile: 1k
#- backend: "chunker"
# remote: "TestChunkerChunk50bSHA1QuickS3:"
# fastlist: true
# maxfile: 1k
- backend: "chunker"
remote: "TestChunkerChunk50bMD5HashS3:"
fastlist: true
maxfile: 1k
- backend: "chunker"
remote: "TestChunkerChunk50bSHA1HashS3:"
fastlist: true
maxfile: 1k
- backend: "chunker"
remote: "TestChunkerChunk50bMD5QuickS3:"
fastlist: true
maxfile: 1k
- backend: "chunker"
remote: "TestChunkerChunk50bSHA1QuickS3:"
fastlist: true
maxfile: 1k
## end chunker
- backend: "drive"
remote: "TestDrive:"