chunker: finish meta-format before release
changes: - chunker: remove GetTier and SetTier - remove wdmrcompat metaformat - remove fastopen strategy - make hash_type option non-advanced - advertise hash support when possible - add metadata field "ver", run strict checks - describe internal behavior in comments - improve documentation note: wdmrcompat used to write file name in the metadata, so maximum metadata size was 1K; removing it allows capping the size at 200 bytes now.
This commit is contained in:
parent
c41812fc88
commit
ccecfa9cb1
5 changed files with 303 additions and 312 deletions
|
@ -36,13 +36,11 @@ const (
|
|||
// WARNING: this optimization is not transaction safe!
|
||||
optimizeFirstChunk = false
|
||||
|
||||
// Normally metadata is a small (less than 1KB) piece of JSON.
|
||||
// Normally metadata is a small (100-200 bytes) piece of JSON.
|
||||
// Valid metadata size should not exceed this limit.
|
||||
maxMetaDataSize = 1023
|
||||
maxMetaDataSize = 199
|
||||
|
||||
// fastopen strategy opens all chunks immediately, but reads sequentially.
|
||||
// linear strategy opens and reads chunks sequentially, without read-ahead.
|
||||
downloadStrategy = "linear"
|
||||
metaDataVersion = 1
|
||||
)
|
||||
|
||||
// Formatting of temporary chunk names. Temporary suffix *follows* chunk
|
||||
|
@ -52,6 +50,13 @@ var (
|
|||
tempChunkRegexp = regexp.MustCompile(`^(.+)\.\.tmp_([0-9]{10,19})$`)
|
||||
)
|
||||
|
||||
// Note: metadata logic is tightly coupled with chunker code in many
|
||||
// places of the code, eg. in checks whether a file can have meta object
|
||||
// or is eligible for chunking.
|
||||
// If more metadata formats (or versions of a format) are added in future,
|
||||
// it may be advisable to factor it into a "metadata strategy" interface
|
||||
// similar to chunkingReader or linearReader below.
|
||||
|
||||
// Register with Fs
|
||||
func init() {
|
||||
fs.Register(&fs.RegInfo{
|
||||
|
@ -98,16 +103,10 @@ Metadata is a small JSON file named after the composite file.`,
|
|||
Value: "simplejson",
|
||||
Help: `Simple JSON supports hash sums and chunk validation.
|
||||
It has the following fields: size, nchunks, md5, sha1.`,
|
||||
}, {
|
||||
Value: "wdmrcompat",
|
||||
Help: `This format brings compatibility with WebDavMailRuCloud.
|
||||
It does not support hash sums or validation, most fields are ignored.
|
||||
It has the following fields: Name, Size, PublicKey, CreationDate.
|
||||
Requires hash type "none".`,
|
||||
}},
|
||||
}, {
|
||||
Name: "hash_type",
|
||||
Advanced: true,
|
||||
Advanced: false,
|
||||
Default: "md5",
|
||||
Help: `Choose how chunker handles hash sums.`,
|
||||
Examples: []fs.OptionExample{{
|
||||
|
@ -122,8 +121,8 @@ for a single-chunk file but returns nothing otherwise.`,
|
|||
Help: `SHA1 for multi-chunk files. Requires "simplejson".`,
|
||||
}, {
|
||||
Value: "md5quick",
|
||||
Help: `When a file is copied on to chunker, MD5 is taken from its source
|
||||
falling back to SHA1 if the source doesn't support it. Requires "simplejson".`,
|
||||
Help: `Copying a file to chunker will request MD5 from the source
|
||||
falling back to SHA1 if unsupported. Requires "simplejson".`,
|
||||
}, {
|
||||
Value: "sha1quick",
|
||||
Help: `Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".`,
|
||||
|
@ -188,7 +187,7 @@ func NewFs(name, rpath string, m configmap.Mapper) (fs.Fs, error) {
|
|||
switch opt.MetaFormat {
|
||||
case "none":
|
||||
f.useMeta = false
|
||||
case "simplejson", "wdmrcompat":
|
||||
case "simplejson":
|
||||
f.useMeta = true
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported meta format '%s'", opt.MetaFormat)
|
||||
|
@ -243,8 +242,6 @@ func NewFs(name, rpath string, m configmap.Mapper) (fs.Fs, error) {
|
|||
WriteMimeType: true,
|
||||
BucketBased: true,
|
||||
CanHaveEmptyDirectories: true,
|
||||
SetTier: true,
|
||||
GetTier: true,
|
||||
ServerSideAcrossConfigs: true,
|
||||
}).Fill(f).Mask(baseFs).WrapsFs(f, baseFs)
|
||||
|
||||
|
@ -393,6 +390,19 @@ func (f *Fs) parseChunkName(name string) (mainName string, chunkNo int, tempNo i
|
|||
//
|
||||
// This should return ErrDirNotFound if the directory isn't
|
||||
// found.
|
||||
//
|
||||
// Commands normally cleanup all temporary chunks in case of a failure.
|
||||
// However, if rclone dies unexpectedly, it can leave behind a bunch of
|
||||
// hidden temporary chunks. List and its underlying chunkEntries()
|
||||
// silently skip all temporary chunks in the directory. It's okay if
|
||||
// they belong to an unfinished command running in parallel.
|
||||
//
|
||||
// However, there is no way to discover dead temporary chunks a.t.m.
|
||||
// As a workaround users can use `purge` to forcibly remove the whole
|
||||
// directory together with dead chunks.
|
||||
// In future a flag named like `--chunker-list-hidden` may be added to
|
||||
// rclone that will tell List to reveal hidden chunks.
|
||||
//
|
||||
func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
|
||||
entries, err = f.base.List(ctx, dir)
|
||||
if err != nil {
|
||||
|
@ -428,7 +438,8 @@ func (f *Fs) ListR(ctx context.Context, dir string, callback fs.ListRCallback) (
|
|||
})
|
||||
}
|
||||
|
||||
// Add some directory entries. This alters entries returning it as newEntries.
|
||||
// chunkEntries is called by List(R). It merges chunk entries from
|
||||
// wrapped remote into composite directory entries.
|
||||
func (f *Fs) chunkEntries(ctx context.Context, origEntries fs.DirEntries, hardErrors bool) (chunkedEntries fs.DirEntries, err error) {
|
||||
// sort entries, so that meta objects (if any) appear before their chunks
|
||||
sortedEntries := make(fs.DirEntries, len(origEntries))
|
||||
|
@ -514,6 +525,11 @@ func (f *Fs) chunkEntries(ctx context.Context, origEntries fs.DirEntries, hardEr
|
|||
}
|
||||
|
||||
// NewObject finds the Object at remote.
|
||||
//
|
||||
// Please note that every NewObject invocation will scan the whole directory.
|
||||
// Using here something like fs.DirCache might improve performance (and make
|
||||
// logic more complex though).
|
||||
//
|
||||
func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) {
|
||||
if mainRemote, _, _ := f.parseChunkName(remote); mainRemote != "" {
|
||||
return nil, fmt.Errorf("%q should be meta object, not a chunk", remote)
|
||||
|
@ -622,23 +638,14 @@ func (o *Object) readMetaData(ctx context.Context) error {
|
|||
case "simplejson":
|
||||
metaInfo, err := unmarshalSimpleJSON(ctx, metaObject, metaData)
|
||||
if err != nil {
|
||||
// TODO: maybe it's a small single chunk?
|
||||
return err
|
||||
// TODO: in a rare case we might mistake a small file for metadata
|
||||
return errors.Wrap(err, "invalid metadata")
|
||||
}
|
||||
if o.size != metaInfo.Size() || len(o.chunks) != metaInfo.nChunks {
|
||||
return errors.New("invalid simplejson metadata")
|
||||
return errors.New("metadata doesn't match file size")
|
||||
}
|
||||
o.md5 = metaInfo.md5
|
||||
o.sha1 = metaInfo.sha1
|
||||
case "wdmrcompat":
|
||||
metaInfo, err := unmarshalWDMRCompat(ctx, metaObject, metaData)
|
||||
if err != nil {
|
||||
// TODO: maybe it's a small single chunk?
|
||||
return err
|
||||
}
|
||||
if o.size != metaInfo.Size() {
|
||||
return errors.New("invalid wdmrcompat metadata")
|
||||
}
|
||||
}
|
||||
|
||||
o.isFull = true
|
||||
|
@ -784,9 +791,6 @@ func (f *Fs) put(ctx context.Context, in io.Reader, src fs.ObjectInfo, remote st
|
|||
case "simplejson":
|
||||
c.updateHashes()
|
||||
metaData, err = marshalSimpleJSON(ctx, sizeTotal, len(c.chunks), c.md5, c.sha1)
|
||||
case "wdmrcompat":
|
||||
fileInfo := f.wrapInfo(src, baseRemote, sizeTotal)
|
||||
metaData, err = marshalWDMRCompat(ctx, fileInfo)
|
||||
}
|
||||
if err == nil {
|
||||
metaInfo := f.wrapInfo(src, baseRemote, int64(len(metaData)))
|
||||
|
@ -951,6 +955,9 @@ func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, opt
|
|||
|
||||
// Update in to the object with the modTime given of the given size
|
||||
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
|
||||
if err := o.readMetaData(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
basePut := o.f.base.Put
|
||||
if src.Size() < 0 {
|
||||
basePut = o.f.base.Features().PutStream
|
||||
|
@ -989,8 +996,17 @@ func (f *Fs) Precision() time.Duration {
|
|||
}
|
||||
|
||||
// Hashes returns the supported hash sets.
|
||||
// Chunker advertises a hash type if and only if it can be calculated
|
||||
// for files of any size, multi-chunked or small.
|
||||
func (f *Fs) Hashes() hash.Set {
|
||||
return hash.Set(hash.None)
|
||||
// composites && all of them && small files supported by wrapped remote
|
||||
if f.useMD5 && !f.quickHash && f.base.Hashes().Contains(hash.MD5) {
|
||||
return hash.NewHashSet(hash.MD5)
|
||||
}
|
||||
if f.useSHA1 && !f.quickHash && f.base.Hashes().Contains(hash.SHA1) {
|
||||
return hash.NewHashSet(hash.SHA1)
|
||||
}
|
||||
return hash.NewHashSet() // can't provide strong guarantees
|
||||
}
|
||||
|
||||
// Mkdir makes the directory (container, bucket)
|
||||
|
@ -1012,7 +1028,12 @@ func (f *Fs) Rmdir(ctx context.Context, dir string) error {
|
|||
// Implement this if you have a way of deleting all the files
|
||||
// quicker than just running Remove() on the result of List()
|
||||
//
|
||||
// Return an error if it doesn't exist
|
||||
// Return an error if it doesn't exist.
|
||||
//
|
||||
// This command will chain to `purge` from wrapped remote.
|
||||
// As a result it removes not only chunker files with their
|
||||
// active chunks but also all hidden chunks in the directory.
|
||||
//
|
||||
func (f *Fs) Purge(ctx context.Context) error {
|
||||
do := f.base.Features().Purge
|
||||
if do == nil {
|
||||
|
@ -1021,7 +1042,25 @@ func (f *Fs) Purge(ctx context.Context) error {
|
|||
return do(ctx)
|
||||
}
|
||||
|
||||
// Remove an object
|
||||
// Remove an object (chunks and metadata, if any)
|
||||
//
|
||||
// Remove deletes only active chunks of the object.
|
||||
// It does not try to look for temporary chunks because they could belong
|
||||
// to another command modifying this composite file in parallel.
|
||||
//
|
||||
// Commands normally cleanup all temporary chunks in case of a failure.
|
||||
// However, if rclone dies unexpectedly, it can leave hidden temporary
|
||||
// chunks, which cannot be discovered using the `list` command.
|
||||
// Remove does not try to search for such chunks or delete them.
|
||||
// Sometimes this can lead to strange results eg. when `list` shows that
|
||||
// directory is empty but `rmdir` refuses to remove it because on the
|
||||
// level of wrapped remote it's actually *not* empty.
|
||||
// As a workaround users can use `purge` to forcibly remove it.
|
||||
//
|
||||
// In future, a flag `--chunker-delete-hidden` may be added which tells
|
||||
// Remove to search directory for hidden chunks and remove them too
|
||||
// (at the risk of breaking parallel commands).
|
||||
//
|
||||
func (o *Object) Remove(ctx context.Context) (err error) {
|
||||
if o.main != nil {
|
||||
err = o.main.Remove(ctx)
|
||||
|
@ -1095,13 +1134,6 @@ func (f *Fs) copyOrMove(ctx context.Context, o *Object, remote string, do copyMo
|
|||
metaInfo := f.wrapInfo(metaObject, "", int64(len(metaData)))
|
||||
err = newObj.main.Update(ctx, bytes.NewReader(metaData), metaInfo)
|
||||
}
|
||||
case "wdmrcompat":
|
||||
newInfo := f.wrapInfo(metaObject, "", newObj.size)
|
||||
metaData, err = marshalWDMRCompat(ctx, newInfo)
|
||||
if err == nil {
|
||||
metaInfo := f.wrapInfo(metaObject, "", int64(len(metaData)))
|
||||
err = newObj.main.Update(ctx, bytes.NewReader(metaData), metaInfo)
|
||||
}
|
||||
case "none":
|
||||
if newObj.main != nil {
|
||||
err = newObj.main.Remove(ctx)
|
||||
|
@ -1436,7 +1468,22 @@ func (o *Object) SetModTime(ctx context.Context, mtime time.Time) error {
|
|||
|
||||
// Hash returns the selected checksum of the file.
|
||||
// If no checksum is available it returns "".
|
||||
// It prefers the wrapped hashsum for a non-chunked file, then tries saved one.
|
||||
//
|
||||
// Hash prefers wrapped hashsum for a non-chunked file, then tries to
|
||||
// read it from metadata. This in theory handles an unusual case when
|
||||
// a small file is modified on the lower level by wrapped remote
|
||||
// but chunker is not yet aware of changes.
|
||||
//
|
||||
// Currently metadata (if not configured as 'none') is kept only for
|
||||
// multi-chunk files, but for small files chunker obtains hashsums from
|
||||
// wrapped remote. If a particular hashsum type is not supported,
|
||||
// chunker won't fail with `unsupported` error but return empty hash.
|
||||
//
|
||||
// In future metadata logic can be extended: if a normal (non-quick)
|
||||
// hash type is configured, chunker will check whether wrapped remote
|
||||
// supports it (see Fs.Hashes as an example). If not, it will add metadata
|
||||
// to small files as well, thus providing hashsums for all files.
|
||||
//
|
||||
func (o *Object) Hash(ctx context.Context, hashType hash.Type) (string, error) {
|
||||
if !o.isChunked() {
|
||||
// First, chain to the single wrapped chunk, if possible.
|
||||
|
@ -1500,78 +1547,10 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (rc io.Read
|
|||
limit = o.size - offset
|
||||
}
|
||||
|
||||
switch downloadStrategy {
|
||||
case "linear":
|
||||
return o.newLinearReader(ctx, offset, limit, openOptions)
|
||||
case "fastopen":
|
||||
return o.newFastopenReader(ctx, offset, limit, openOptions)
|
||||
default:
|
||||
return nil, errors.New("invalid download strategy")
|
||||
}
|
||||
}
|
||||
|
||||
// fastopenReader opens all chunks immediately, but reads sequentially
|
||||
type fastopenReader struct {
|
||||
readClosers []io.ReadCloser
|
||||
multiReader io.Reader
|
||||
}
|
||||
|
||||
func (o *Object) newFastopenReader(ctx context.Context, offset, limit int64, options []fs.OpenOption) (io.ReadCloser, error) {
|
||||
var (
|
||||
readers []io.Reader
|
||||
readClosers []io.ReadCloser
|
||||
)
|
||||
for _, chunk := range o.chunks {
|
||||
if limit <= 0 {
|
||||
break
|
||||
}
|
||||
count := chunk.Size()
|
||||
if offset >= count {
|
||||
offset -= count
|
||||
continue
|
||||
}
|
||||
count -= offset
|
||||
if limit < count {
|
||||
count = limit
|
||||
}
|
||||
|
||||
end := offset + count - 1
|
||||
chunkOptions := append(options, &fs.RangeOption{Start: offset, End: end})
|
||||
rc, err := chunk.Open(ctx, chunkOptions...)
|
||||
if err != nil {
|
||||
r := fastopenReader{readClosers: readClosers}
|
||||
_ = r.Close() // ignore error
|
||||
return nil, err
|
||||
}
|
||||
readClosers = append(readClosers, rc)
|
||||
readers = append(readers, rc)
|
||||
|
||||
offset = 0
|
||||
limit -= count
|
||||
}
|
||||
|
||||
r := &fastopenReader{
|
||||
readClosers: readClosers,
|
||||
multiReader: io.MultiReader(readers...),
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
func (r *fastopenReader) Read(p []byte) (n int, err error) {
|
||||
return r.multiReader.Read(p)
|
||||
}
|
||||
|
||||
func (r *fastopenReader) Close() (err error) {
|
||||
for _, rc := range r.readClosers {
|
||||
chunkErr := rc.Close()
|
||||
if err == nil {
|
||||
err = chunkErr
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// linearReader opens and reads chunks sequentially, without read-ahead
|
||||
// linearReader opens and reads file chunks sequentially, without read-ahead
|
||||
type linearReader struct {
|
||||
ctx context.Context
|
||||
chunks []fs.Object
|
||||
|
@ -1771,25 +1750,9 @@ func (o *Object) ID() string {
|
|||
return ""
|
||||
}
|
||||
|
||||
// SetTier performs changing storage tier of the Object if
|
||||
// multiple storage classes supported
|
||||
func (o *Object) SetTier(tier string) error {
|
||||
if doer, ok := o.mainChunk().(fs.SetTierer); ok {
|
||||
return doer.SetTier(tier)
|
||||
}
|
||||
return errors.New("chunker: wrapped remote does not support SetTier")
|
||||
}
|
||||
|
||||
// GetTier returns storage tier or class of the Object
|
||||
func (o *Object) GetTier() string {
|
||||
if doer, ok := o.mainChunk().(fs.GetTierer); ok {
|
||||
return doer.GetTier()
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// Meta format `simplejson`
|
||||
type metaSimpleJSON struct {
|
||||
Version int `json:"ver"`
|
||||
Size int64 `json:"size"`
|
||||
NChunks int `json:"nchunks"`
|
||||
MD5 string `json:"md5"`
|
||||
|
@ -1798,6 +1761,7 @@ type metaSimpleJSON struct {
|
|||
|
||||
func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 string) (data []byte, err error) {
|
||||
metaData := &metaSimpleJSON{
|
||||
Version: metaDataVersion,
|
||||
Size: size,
|
||||
NChunks: nChunks,
|
||||
MD5: md5,
|
||||
|
@ -1806,47 +1770,56 @@ func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 s
|
|||
return json.Marshal(&metaData)
|
||||
}
|
||||
|
||||
// Note: only metadata format version 1 is supported a.t.m.
|
||||
//
|
||||
// Current implementation creates metadata only for files larger than
|
||||
// configured chunk size. This approach has drawback: availability of
|
||||
// configured hashsum type for small files depends on the wrapped remote.
|
||||
// Future versions of chunker may change approach as described in comment
|
||||
// to the Hash method. They can transparently migrate older metadata.
|
||||
// New format will have a higher version number and cannot be correctly
|
||||
// handled by current implementation.
|
||||
// The version check below will then explicitly ask user to upgrade rclone.
|
||||
//
|
||||
func unmarshalSimpleJSON(ctx context.Context, metaObject fs.Object, data []byte) (info *ObjectInfo, err error) {
|
||||
var metaData *metaSimpleJSON
|
||||
err = json.Unmarshal(data, &metaData)
|
||||
if err != nil {
|
||||
return
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Perform strict checks, avoid corruption of future metadata formats.
|
||||
if metaData.Size < 0 {
|
||||
return nil, errors.New("negative file size")
|
||||
}
|
||||
if metaData.NChunks <= 0 {
|
||||
return nil, errors.New("wrong number of chunks")
|
||||
}
|
||||
if metaData.MD5 != "" {
|
||||
_, err = hex.DecodeString(metaData.MD5)
|
||||
if len(metaData.MD5) != 32 || err != nil {
|
||||
return nil, errors.New("wrong md5 hash")
|
||||
}
|
||||
}
|
||||
if metaData.SHA1 != "" {
|
||||
_, err = hex.DecodeString(metaData.SHA1)
|
||||
if len(metaData.SHA1) != 40 || err != nil {
|
||||
return nil, errors.New("wrong sha1 hash")
|
||||
}
|
||||
}
|
||||
if metaData.Version <= 0 {
|
||||
return nil, errors.New("wrong version number")
|
||||
}
|
||||
if metaData.Version != metaDataVersion {
|
||||
return nil, errors.Errorf("version %d is not supported, please upgrade rclone", metaData.Version)
|
||||
}
|
||||
|
||||
var nilFs *Fs // nil object triggers appropriate type method
|
||||
info = nilFs.wrapInfo(metaObject, "", metaData.Size)
|
||||
info.md5 = metaData.MD5
|
||||
info.sha1 = metaData.SHA1
|
||||
info.nChunks = metaData.NChunks
|
||||
return
|
||||
}
|
||||
|
||||
// Meta format `wdmrcompat`
|
||||
type metaWDMRCompat struct {
|
||||
Name string `json:"Name"`
|
||||
Size int64 `json:"Size"`
|
||||
PublicKey interface{} `json:"PublicKey"` // ignored, can be nil
|
||||
CreationDate time.Time `json:"CreationDate"` // modification time, ignored
|
||||
}
|
||||
|
||||
func marshalWDMRCompat(ctx context.Context, srcInfo fs.ObjectInfo) (data []byte, err error) {
|
||||
metaData := &metaWDMRCompat{
|
||||
Name: path.Base(srcInfo.Remote()),
|
||||
Size: srcInfo.Size(),
|
||||
PublicKey: nil,
|
||||
CreationDate: srcInfo.ModTime(ctx).UTC(),
|
||||
}
|
||||
return json.Marshal(&metaData)
|
||||
}
|
||||
|
||||
func unmarshalWDMRCompat(ctx context.Context, metaObject fs.Object, data []byte) (info *ObjectInfo, err error) {
|
||||
var metaData *metaWDMRCompat
|
||||
err = json.Unmarshal(data, &metaData)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
var nilFs *Fs // nil object triggers appropriate type method
|
||||
info = nilFs.wrapInfo(metaObject, "", metaData.Size)
|
||||
return
|
||||
return info, nil
|
||||
}
|
||||
|
||||
// Check the interfaces are satisfied
|
||||
|
@ -1868,6 +1841,4 @@ var (
|
|||
_ fs.Object = (*Object)(nil)
|
||||
_ fs.ObjectUnWrapper = (*Object)(nil)
|
||||
_ fs.IDer = (*Object)(nil)
|
||||
_ fs.SetTierer = (*Object)(nil)
|
||||
_ fs.GetTierer = (*Object)(nil)
|
||||
)
|
||||
|
|
|
@ -31,7 +31,11 @@ func TestIntegration(t *testing.T) {
|
|||
RemoteName: *fstest.RemoteName,
|
||||
NilObject: (*chunker.Object)(nil),
|
||||
SkipBadWindowsCharacters: !*UseBadChars,
|
||||
UnimplementableObjectMethods: []string{"MimeType"},
|
||||
UnimplementableObjectMethods: []string{
|
||||
"MimeType",
|
||||
"GetTier",
|
||||
"SetTier",
|
||||
},
|
||||
UnimplementableFsMethods: []string{
|
||||
"PublicLink",
|
||||
"OpenWriterAt",
|
||||
|
|
|
@ -4,11 +4,11 @@ description: "Split-chunking overlay remote"
|
|||
date: "2019-08-30"
|
||||
---
|
||||
|
||||
<i class="fa fa-cut"></i>Chunker
|
||||
<i class="fa fa-cut"></i>Chunker (BETA)
|
||||
----------------------------------------
|
||||
|
||||
The `chunker` overlay transparently splits large files into smaller chunks
|
||||
during the upload to wrapped remote and transparently assembles them back
|
||||
during upload to wrapped remote and transparently assembles them back
|
||||
when the file is downloaded. This makes it possible to effectively overcome size limits
|
||||
imposed by storage providers.
|
||||
|
||||
|
@ -41,10 +41,27 @@ Storage> chunker
|
|||
Remote to chunk/unchunk.
|
||||
Normally should contain a ':' and a path, eg "myremote:path/to/dir",
|
||||
"myremote:bucket" or maybe "myremote:" (not recommended).
|
||||
Enter a string value. Press Enter for the default ("").
|
||||
remote> remote:path
|
||||
Files larger than chunk_size will be split in chunks. By default 2 Gb.
|
||||
Files larger than chunk size will be split in chunks.
|
||||
Enter a size with suffix k,M,G,T. Press Enter for the default ("2G").
|
||||
chunk_size> 1G
|
||||
chunk_size> 100M
|
||||
Choose how chunker handles hash sums.
|
||||
Enter a string value. Press Enter for the default ("md5").
|
||||
Choose a number from below, or type in your own value
|
||||
/ Chunker can pass any hash supported by wrapped remote
|
||||
1 | for a single-chunk file but returns nothing otherwise.
|
||||
\ "none"
|
||||
2 / MD5 for multi-chunk files. Requires "simplejson".
|
||||
\ "md5"
|
||||
3 / SHA1 for multi-chunk files. Requires "simplejson".
|
||||
\ "sha1"
|
||||
/ Copying a file to chunker will request MD5 from the source
|
||||
4 | falling back to SHA1 if unsupported. Requires "simplejson".
|
||||
\ "md5quick"
|
||||
5 / Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
|
||||
\ "sha1quick"
|
||||
hash_type> md5
|
||||
Edit advanced config? (y/n)
|
||||
y) Yes
|
||||
n) No
|
||||
|
@ -53,8 +70,9 @@ Remote config
|
|||
--------------------
|
||||
[overlay]
|
||||
type = chunker
|
||||
remote = TestLocal:
|
||||
chunk_size = 2G
|
||||
remote = remote:bucket
|
||||
chunk_size = 100M
|
||||
hash_type = md5
|
||||
--------------------
|
||||
y) Yes this is OK
|
||||
e) Edit this remote
|
||||
|
@ -73,8 +91,8 @@ will put files in a directory called `name` in the current directory.
|
|||
|
||||
### Chunking
|
||||
|
||||
When rclone starts a file upload, chunker checks the file size.
|
||||
If it doesn't exceed the configured chunk size, chunker will just pass it
|
||||
When rclone starts a file upload, chunker checks the file size. If it
|
||||
doesn't exceed the configured chunk size, chunker will just pass the file
|
||||
to the wrapped remote. If a file is large, chunker will transparently cut
|
||||
data in pieces with temporary names and stream them one by one, on the fly.
|
||||
Each chunk will contain the specified number of data bytes, except for the
|
||||
|
@ -84,7 +102,7 @@ a temporary copy, record its size and repeat the above process.
|
|||
When upload completes, temporary chunk files are finally renamed.
|
||||
This scheme guarantees that operations look from outside as atomic.
|
||||
A similar method with hidden temporary chunks is used for other operations
|
||||
(copy/move/rename etc). If operation fails, hidden chunks are normally
|
||||
(copy/move/rename etc). If an operation fails, hidden chunks are normally
|
||||
destroyed, and the destination composite file stays intact.
|
||||
|
||||
#### Chunk names
|
||||
|
@ -94,58 +112,52 @@ By default chunk names are `BIG_FILE_NAME.rclone-chunk.001`,
|
|||
format is `*.rclone-chunk.###`. You can configure another name format
|
||||
using the `--chunker-name-format` option. The format uses asterisk
|
||||
`*` as a placeholder for the base file name and one or more consecutive
|
||||
hash characters `#` as a placeholder for the chunk number. There must be
|
||||
one and only one asterisk. The number of consecutive hashes defines the
|
||||
minimum length of a string representing a chunk number. If a chunk number
|
||||
has less digits than the number of hashes, it is left-padded by zeros.
|
||||
If there are more digits in the number, they are left as is.
|
||||
hash characters `#` as a placeholder for sequential chunk number.
|
||||
There must be one and only one asterisk. The number of consecutive hash
|
||||
characters defines the minimum length of a string representing a chunk number.
|
||||
If the decimal chunk number has fewer digits than the number of hashes, it is
|
||||
left-padded by zeros. If the number string is longer, it is left intact.
|
||||
By default numbering starts from 1 but there is another option that allows
|
||||
user to start from 0, eg. for compatibility with legacy software.
|
||||
|
||||
For example, if name format is `big_*-##.part`, and original file was
|
||||
named `data.txt` and numbering starts from 0, then the first chunk will be
|
||||
named `big_data.txt-00.part`, the 99th chunk will be `big_data.txt-98.part`
|
||||
and the 302nd chunk will be `big_data.txt-301.part`.
|
||||
For example, if name format is `big_*-##.part` and original file name is
|
||||
`data.txt` and numbering starts from 0, then the first chunk will be named
|
||||
`big_data.txt-00.part`, the 99th chunk will be `big_data.txt-98.part`
|
||||
and the 302nd chunk will become `big_data.txt-301.part`.
|
||||
|
||||
Would-be chunk files are ignored if their name does not match given format.
|
||||
The list command might encounter composite files with missing or invalid
|
||||
chunks. By default, if chunker detects a missing chunk it will silently
|
||||
ignore the whole group. Use the `--chunker-fail-on-bad-chunks` flag
|
||||
to make it fail with an error message.
|
||||
When the `list` rclone command scans a directory on wrapped remote, the
|
||||
potential chunk files are accounted for and merged into composite directory
|
||||
entries only if their names match the configured format. All other files
|
||||
are ignored, including temporary chunks.
|
||||
The list command might encounter composite files with missing or invalid
|
||||
chunks. If chunker detects a missing chunk it will by default silently
|
||||
ignore the whole group. You can use the `--chunker-fail-on-bad-chunks`
|
||||
command line flag to make `list` fail with an error message.
|
||||
|
||||
|
||||
### Metadata
|
||||
|
||||
By default when a file is large enough, chunker will create a metadata
|
||||
object besides data chunks. The object is named after the original file.
|
||||
Chunker allows to choose between few metadata formats. Please note that
|
||||
currently metadata is not created for files smaller than configured
|
||||
chunk size. This may change in future as new formats are developed.
|
||||
Chunker allows user to disable metadata completely (the `none` format).
|
||||
Please note that currently metadata is not created for files smaller
|
||||
than configured chunk size. This may change in future as new formats
|
||||
are developed.
|
||||
|
||||
#### Simple JSON metadata format
|
||||
|
||||
This is the default format. It supports hash sums and chunk validation
|
||||
for composite files. Meta objects carry the following fields:
|
||||
|
||||
- `size` - total size of chunks
|
||||
- `nchunks` - number of chunks
|
||||
- `md5` - MD5 hashsum (if present)
|
||||
- `ver` - version of format, currently `1`
|
||||
- `size` - total size of composite file
|
||||
- `nchunks` - number of chunks in the file
|
||||
- `md5` - MD5 hashsum of composite file (if present)
|
||||
- `sha1` - SHA1 hashsum (if present)
|
||||
|
||||
There is no field for composite file name as it's simply equal to the name
|
||||
of meta object on the wrapped remote. Please refer to respective sections
|
||||
for details on hashsums and modified time handling.
|
||||
|
||||
#### WebDavMailRu compatible metadata format
|
||||
|
||||
The `wdmrcompat` metadata format is only useful to support historical files
|
||||
created by [WebDriveMailru](https://github.com/yar229/WebDavMailRuCloud).
|
||||
It keeps the following fields (most are ignored, though):
|
||||
|
||||
- `Name` - name of the composite file (always equal to the meta file name)
|
||||
- `Size` - total size of chunks
|
||||
- `PublicKey` - ignored, always "null"
|
||||
- `CreationDate` - last modification (sic!) time, ignored.
|
||||
for details on hashsums and handling of modified time.
|
||||
|
||||
#### No metadata
|
||||
|
||||
|
@ -161,8 +173,8 @@ errors (especially missing last chunk) than metadata-enabled formats.
|
|||
### Hashsums
|
||||
|
||||
Chunker supports hashsums only when a compatible metadata is present.
|
||||
Thus, if you choose metadata format of `none` or `wdmrcompat`, chunker
|
||||
will return `UNSUPPORTED` as hashsum.
|
||||
Thus, if you choose metadata format of `none`, chunker will return
|
||||
`UNSUPPORTED` as hashsum.
|
||||
|
||||
Please note that metadata is stored only for composite files. If a file
|
||||
is small (smaller than configured chunk size), chunker will transparently
|
||||
|
@ -175,16 +187,16 @@ Currently you can choose one or another but not both.
|
|||
MD5 is set by default as the most supported type.
|
||||
Since chunker keeps hashes for composite files and falls back to the
|
||||
wrapped remote hash for small ones, we advise you to choose the same
|
||||
hash type as wrapped remote, so your file listings look coherent.
|
||||
hash type as wrapped remote so that your file listings look coherent.
|
||||
|
||||
Normally, when a file is copied to chunker controlled remote, chunker
|
||||
will ask its source for compatible file hash and revert to on-the-fly
|
||||
Normally, when a file is copied to a chunker controlled remote, chunker
|
||||
will ask the file source for compatible file hash and revert to on-the-fly
|
||||
calculation if none is found. This involves some CPU overhead but provides
|
||||
a guarantee that given hashsum is available. Also, chunker will reject
|
||||
a server-side copy or move operation if source and destination hashsum
|
||||
types are different, resulting in the extra network bandwidth, too.
|
||||
In some rare cases this may be undesired, so chunker provides two optional
|
||||
choices: `sha1quick` and `md5quick`. If source does not have the primary
|
||||
choices: `sha1quick` and `md5quick`. If the source does not support primary
|
||||
hash type and the quick mode is enabled, chunker will try to fall back to
|
||||
the secondary type. This will save CPU and bandwidth but can result in empty
|
||||
hashsums at destination. Beware of consequences: the `sync` command will
|
||||
|
@ -215,13 +227,14 @@ chunk naming scheme is to:
|
|||
hash type, chunk naming etc.
|
||||
- Now run `rclone sync oldchunks: newchunks:` and all your data
|
||||
will be transparently converted at transfer.
|
||||
This may take some time.
|
||||
This may take some time, yet chunker will try server-side
|
||||
copy if possible.
|
||||
- After checking data integrity you may remove configuration section
|
||||
of the old remote.
|
||||
|
||||
If rclone gets killed during a long operation on a big composite file,
|
||||
hidden temporary chunks may stay in the directory. They will not be
|
||||
shown by the list command but will eat up your account quota.
|
||||
shown by the `list` command but will eat up your account quota.
|
||||
Please note that the `deletefile` rclone command deletes only active
|
||||
chunks of a file. As a workaround, you can use the remote of the wrapped
|
||||
file system to see them.
|
||||
|
@ -234,17 +247,18 @@ remove everything including garbage.
|
|||
### Caveats and Limitations
|
||||
|
||||
Chunker requires the wrapped remote to support server-side `move` (or `copy` +
|
||||
delete) operations, otherwise it will explicitly refuse to start.
|
||||
`delete`) operations, otherwise it will explicitly refuse to start.
|
||||
This is because it internally renames temporary chunk files to their final
|
||||
names when an operation completes successfully.
|
||||
|
||||
Note that moves done using the copy-and-delete method may incur double
|
||||
charging with some cloud storage providers.
|
||||
Note that a move implemented using the copy-and-delete method may incur
|
||||
double charging with some cloud storage providers.
|
||||
|
||||
Chunker will not automatically rename existing chunks when you change the
|
||||
chunk name format. Beware that in result of this some files which have been
|
||||
treated as chunks before the change can pop up in directory listings as
|
||||
normal files and vice versa. The same warning holds for the chunk size.
|
||||
Chunker will not automatically rename existing chunks when you run
|
||||
`rclone config` on a live remote and change the chunk name format.
|
||||
Beware that as a result some files which have been treated as chunks
|
||||
before the change can pop up in directory listings as normal files
|
||||
and vice versa. The same warning holds for the chunk size.
|
||||
If you desperately need to change critical chunking settings, you should
|
||||
run data migration as described in a dedicated section.
|
||||
|
||||
|
@ -278,6 +292,28 @@ Files larger than chunk size will be split in chunks.
|
|||
- Type: SizeSuffix
|
||||
- Default: 2G
|
||||
|
||||
#### --chunker-hash-type
|
||||
|
||||
Choose how chunker handles hash sums.
|
||||
|
||||
- Config: hash_type
|
||||
- Env Var: RCLONE_CHUNKER_HASH_TYPE
|
||||
- Type: string
|
||||
- Default: "md5"
|
||||
- Examples:
|
||||
- "none"
|
||||
- Chunker can pass any hash supported by wrapped remote
|
||||
- for a single-chunk file but returns nothing otherwise.
|
||||
- "md5"
|
||||
- MD5 for multi-chunk files. Requires "simplejson".
|
||||
- "sha1"
|
||||
- SHA1 for multi-chunk files. Requires "simplejson".
|
||||
- "md5quick"
|
||||
- Copying a file to chunker will request MD5 from the source
|
||||
- falling back to SHA1 if unsupported. Requires "simplejson".
|
||||
- "sha1quick"
|
||||
- Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
|
||||
|
||||
### Advanced Options
|
||||
|
||||
Here are the advanced options specific to chunker (Transparently chunk/split large files).
|
||||
|
@ -321,33 +357,6 @@ Metadata is a small JSON file named after the composite file.
|
|||
- "simplejson"
|
||||
- Simple JSON supports hash sums and chunk validation.
|
||||
- It has the following fields: size, nchunks, md5, sha1.
|
||||
- "wdmrcompat"
|
||||
- This format brings compatibility with WebDavMailRuCloud.
|
||||
- It does not support hash sums or validation, most fields are ignored.
|
||||
- It has the following fields: Name, Size, PublicKey, CreationDate.
|
||||
- Requires hash type "none".
|
||||
|
||||
#### --chunker-hash-type
|
||||
|
||||
Choose how chunker handles hash sums.
|
||||
|
||||
- Config: hash_type
|
||||
- Env Var: RCLONE_CHUNKER_HASH_TYPE
|
||||
- Type: string
|
||||
- Default: "md5"
|
||||
- Examples:
|
||||
- "none"
|
||||
- Chunker can pass any hash supported by wrapped remote
|
||||
- for a single-chunk file but returns nothing otherwise.
|
||||
- "md5"
|
||||
- MD5 for multi-chunk files. Requires "simplejson".
|
||||
- "sha1"
|
||||
- SHA1 for multi-chunk files. Requires "simplejson".
|
||||
- "md5quick"
|
||||
- When a file is copied on to chunker, MD5 is taken from its source
|
||||
- falling back to SHA1 if the source doesn't support it. Requires "simplejson".
|
||||
- "sha1quick"
|
||||
- Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson".
|
||||
|
||||
#### --chunker-fail-on-bad-chunks
|
||||
|
||||
|
|
|
@ -986,7 +986,6 @@ func TestSyncWithTrackRenames(t *testing.T) {
|
|||
fs.Config.TrackRenames = true
|
||||
defer func() {
|
||||
fs.Config.TrackRenames = false
|
||||
|
||||
}()
|
||||
|
||||
haveHash := r.Fremote.Hashes().Overlap(r.Flocal.Hashes()).GetOne() != hash.None
|
||||
|
@ -1010,45 +1009,64 @@ func TestSyncWithTrackRenames(t *testing.T) {
|
|||
|
||||
fstest.CheckItems(t, r.Fremote, f1, f2)
|
||||
|
||||
if canTrackRenames {
|
||||
if r.Fremote.Features().Move == nil || r.Fremote.Name() == "TestUnion" { // union remote can Move but returns CantMove error
|
||||
// If no server side Move, we are falling back to Copy + Delete
|
||||
assert.Equal(t, int64(1), accounting.GlobalStats().GetTransfers()) // 1 copy
|
||||
assert.Equal(t, int64(4), accounting.GlobalStats().GetChecks()) // 2 file checks + 1 move + 1 delete
|
||||
} else {
|
||||
assert.Equal(t, int64(0), accounting.GlobalStats().GetTransfers()) // 0 copy
|
||||
assert.Equal(t, int64(3), accounting.GlobalStats().GetChecks()) // 2 file checks + 1 move
|
||||
}
|
||||
} else {
|
||||
if toyFileChecks(r) != -1 {
|
||||
assert.Equal(t, toyFileChecks(r), accounting.GlobalStats().GetChecks())
|
||||
}
|
||||
assert.Equal(t, toyFileTransfers(r), accounting.GlobalStats().GetTransfers())
|
||||
}
|
||||
}
|
||||
|
||||
func toyFileChecks(r *fstest.Run) int64 {
|
||||
// As currently there is no Fs interface providing number of chunks
|
||||
// in a file, this test depends on the well-known names of test remotes.
|
||||
remote := r.Fremote.Name()
|
||||
// Numbers below are calculated for a 14 byte file.
|
||||
if !strings.HasPrefix(remote, "TestChunker") {
|
||||
return 2
|
||||
}
|
||||
// Chunker makes more internal checks.
|
||||
|
||||
// Union remote can Move but returns CantMove error.
|
||||
moveAsCopyDelete := r.Fremote.Features().Move == nil || remote == "TestUnion"
|
||||
|
||||
chunker := strings.HasPrefix(remote, "TestChunker")
|
||||
wrappedMoveAsCopyDelete := chunker && strings.HasSuffix(remote, "S3")
|
||||
|
||||
chunk3b := chunker && strings.Contains(remote, "Chunk3b") // chunker with 3 byte chunks
|
||||
chunk50b := chunker && strings.Contains(remote, "Chunk50b") // chunker with 50 byte chunks
|
||||
chunkDefault := chunker && !strings.Contains(remote, "ChunkerChunk") // default big chunk size
|
||||
chunkBig := chunk50b || chunkDefault // file is smaller than chunk size
|
||||
|
||||
// Verify number of checks for a toy 14 byte file.
|
||||
// The order of cases matters!
|
||||
var checks int
|
||||
switch {
|
||||
case strings.Contains(remote, "Chunk3b"): // chunk 3 bytes
|
||||
checks = 6
|
||||
case strings.Contains(remote, "Chunk50b"): // chunk 50 bytes
|
||||
checks = 3
|
||||
case strings.Contains(remote, "ChunkerChunk"): // unknown chunk size
|
||||
return -1
|
||||
case canTrackRenames && chunk3b:
|
||||
checks = 8 // chunker makes extra checks for each small chunk
|
||||
case canTrackRenames && chunkBig:
|
||||
checks = 4 // chunker makes 1 extra check for a single big chunk
|
||||
case canTrackRenames && moveAsCopyDelete:
|
||||
checks = 4 // 2 file checks + 1 move + 1 delete
|
||||
case canTrackRenames:
|
||||
checks = 3 // 2 file checks + 1 move
|
||||
case !chunker:
|
||||
checks = 2 // 2 file checks on a generic non-chunking remote
|
||||
case chunk3b:
|
||||
checks = 6 // chunker makes extra checks for each small chunk
|
||||
case chunkBig && wrappedMoveAsCopyDelete:
|
||||
checks = 4 // one more extra check because S3 emulates Move as Copy+Delete
|
||||
case chunkBig:
|
||||
checks = 3 // chunker makes 1 extra check for a single big chunk
|
||||
default:
|
||||
checks = 3 // large chunks (eventually no chunking)
|
||||
checks = -1 // skip verification for chunker with unknown chunk size
|
||||
}
|
||||
if strings.HasSuffix(remote, "S3") {
|
||||
checks++ // Extra check because S3 emulates Move as Copy+Delete.
|
||||
if checks != -1 { // "-1" allows remotes to bypass this check
|
||||
assert.Equal(t, int64(checks), accounting.GlobalStats().GetChecks())
|
||||
}
|
||||
|
||||
// Verify number of copy operations for a toy 14 byte file.
|
||||
// The order of cases matters!
|
||||
var copies int64
|
||||
switch {
|
||||
case canTrackRenames && moveAsCopyDelete:
|
||||
copies = 1 // 1 copy
|
||||
case canTrackRenames:
|
||||
copies = 0 // 0 copy
|
||||
case chunkBig && wrappedMoveAsCopyDelete:
|
||||
copies = 2 // extra Copy because S3 emulates Move as Copy+Delete.
|
||||
default:
|
||||
copies = 1
|
||||
}
|
||||
if copies != -1 { // "-1" allows remotes to bypass this check
|
||||
assert.Equal(t, copies, accounting.GlobalStats().GetTransfers())
|
||||
}
|
||||
return int64(checks)
|
||||
}
|
||||
|
||||
func toyFileTransfers(r *fstest.Run) int64 {
|
||||
|
|
|
@ -33,9 +33,6 @@ backends:
|
|||
- backend: "chunker"
|
||||
remote: "TestChunkerNometaLocal:"
|
||||
fastlist: true
|
||||
- backend: "chunker"
|
||||
remote: "TestChunkerCompatLocal:"
|
||||
fastlist: true
|
||||
- backend: "chunker"
|
||||
remote: "TestChunkerChunk3bLocal:"
|
||||
fastlist: true
|
||||
|
@ -44,10 +41,6 @@ backends:
|
|||
remote: "TestChunkerChunk3bNometaLocal:"
|
||||
fastlist: true
|
||||
maxfile: 6k
|
||||
- backend: "chunker"
|
||||
remote: "TestChunkerChunk3bCompatLocal:"
|
||||
fastlist: true
|
||||
maxfile: 6k
|
||||
- backend: "chunker"
|
||||
remote: "TestChunkerMailru:"
|
||||
fastlist: true
|
||||
|
@ -66,30 +59,26 @@ backends:
|
|||
- backend: "chunker"
|
||||
remote: "TestChunkerS3:"
|
||||
fastlist: true
|
||||
ignore:
|
||||
- TestIntegration/FsMkdir/FsPutFiles/SetTier
|
||||
- backend: "chunker"
|
||||
remote: "TestChunkerChunk50bS3:"
|
||||
fastlist: true
|
||||
maxfile: 1k
|
||||
ignore:
|
||||
- TestIntegration/FsMkdir/FsPutFiles/SetTier
|
||||
#- backend: "chunker"
|
||||
# remote: "TestChunkerChunk50bMD5HashS3:"
|
||||
# fastlist: true
|
||||
# maxfile: 1k
|
||||
#- backend: "chunker"
|
||||
# remote: "TestChunkerChunk50bMD5QuickS3:"
|
||||
# fastlist: true
|
||||
# maxfile: 1k
|
||||
#- backend: "chunker"
|
||||
# remote: "TestChunkerChunk50bSHA1HashS3:"
|
||||
# fastlist: true
|
||||
# maxfile: 1k
|
||||
#- backend: "chunker"
|
||||
# remote: "TestChunkerChunk50bSHA1QuickS3:"
|
||||
# fastlist: true
|
||||
# maxfile: 1k
|
||||
- backend: "chunker"
|
||||
remote: "TestChunkerChunk50bMD5HashS3:"
|
||||
fastlist: true
|
||||
maxfile: 1k
|
||||
- backend: "chunker"
|
||||
remote: "TestChunkerChunk50bSHA1HashS3:"
|
||||
fastlist: true
|
||||
maxfile: 1k
|
||||
- backend: "chunker"
|
||||
remote: "TestChunkerChunk50bMD5QuickS3:"
|
||||
fastlist: true
|
||||
maxfile: 1k
|
||||
- backend: "chunker"
|
||||
remote: "TestChunkerChunk50bSHA1QuickS3:"
|
||||
fastlist: true
|
||||
maxfile: 1k
|
||||
## end chunker
|
||||
- backend: "drive"
|
||||
remote: "TestDrive:"
|
||||
|
|
Loading…
Reference in a new issue