dropbox: add --dropbox-batch-mode flag to speed up uploading #5156

This adds 3 upload modes for dropbox: off, sync and async, and makes
sync the default.

This should greatly improve uploads, especially for small files.
Nick Craig-Wood 2020-09-07 11:08:46 +01:00
parent 77cda6773c
commit 2e4b65f888
3 changed files with 533 additions and 20 deletions

@@ -22,6 +22,7 @@ of path_display and all will be well.
*/
import (
"bytes"
"context"
"fmt"
"io"
@@ -209,6 +210,63 @@ Note that we don't unmount the shared folder afterwards so the
shared folder.`,
Default: false,
Advanced: true,
}, {
Name: "batch_mode",
Help: `Upload file batching sync|async|off.
This sets the batch mode used by rclone.
For full info see [the main docs](https://rclone.org/dropbox/#batch-mode)
This has 3 possible values:
- off - no batching
- sync - batch uploads and check completion (default)
- async - batch upload and don't check completion
Rclone will close any outstanding batches when it exits which may cause
a delay on quit.
`,
Default: "sync",
Advanced: true,
}, {
Name: "batch_size",
Help: `Max number of files in upload batch.
This sets the batch size of files to upload. It has to be less than 1000.
By default this is 0 which means rclone will calculate the batch size
depending on the setting of batch_mode.
- batch_mode: async - default batch_size is 100
- batch_mode: sync - default batch_size is the same as --transfers
- batch_mode: off - not in use
Rclone will close any outstanding batches when it exits which may cause
a delay on quit.
Setting this is a great idea if you are uploading lots of small files
as it will make them upload a lot quicker. You can use --transfers 32 to
maximise throughput.
`,
Default: 0,
Advanced: true,
}, {
Name: "batch_timeout",
Help: `Max time to allow an idle upload batch before uploading.
If an upload batch is idle for more than this long then it will be
uploaded.
The default for this is 0 which means rclone will choose a sensible
default based on the batch_mode in use.
- batch_mode: async - default batch_timeout is 500ms
- batch_mode: sync - default batch_timeout is 10s
- batch_mode: off - not in use
`,
Default: fs.Duration(0),
Advanced: true,
}, {
Name: config.ConfigEncoding,
Help: config.ConfigEncodingHelp,
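The three help texts above interlock: when batch_size or batch_timeout is left
at its 0 default, the effective value is derived from batch_mode. Below is a
minimal sketch of that derivation, with a hypothetical helper name - the
shipped logic lives in the new batcher.go, one of the 3 changed files, which
is not shown in this view.

// resolveBatchDefaults is a hypothetical helper illustrating how the
// defaults documented in the help texts above fit together.
func resolveBatchDefaults(mode string, size int, timeout time.Duration, transfers int) (int, time.Duration, error) {
	switch mode {
	case "sync":
		if size == 0 {
			size = transfers // same as --transfers
		}
		if timeout == 0 {
			timeout = 10 * time.Second
		}
	case "async":
		if size == 0 {
			size = 100
		}
		if timeout == 0 {
			timeout = 500 * time.Millisecond
		}
	case "off":
		// batching disabled - neither setting is used
	default:
		return 0, 0, fmt.Errorf("batch mode %q not supported", mode)
	}
	return size, timeout, nil
}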
@@ -232,6 +290,10 @@ type Options struct {
Impersonate string `config:"impersonate"`
SharedFiles bool `config:"shared_files"`
SharedFolders bool `config:"shared_folders"`
BatchMode string `config:"batch_mode"`
BatchSize int `config:"batch_size"`
BatchTimeout fs.Duration `config:"batch_timeout"`
AsyncBatch bool `config:"async_batch"`
Enc encoder.MultiEncoder `config:"encoding"`
}
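The config tags above also mean the new options can be set per remote in
rclone.conf instead of via flags. An illustrative stanza - the remote name is
a placeholder and the values shown are just the async-mode defaults from the
help texts:

[dropbox]
type = dropbox
batch_mode = async
batch_size = 100
batch_timeout = 500ms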
@@ -251,6 +313,7 @@ type Fs struct {
slashRootSlash string // root with "/" prefix and postfix, lowercase
pacer *fs.Pacer // To pace the API calls
ns string // The namespace we are using or "" for none
batcher *batcher // batch builder
}
// Object describes a dropbox object
@@ -266,8 +329,6 @@ type Object struct {
hash string // content_hash of the object
}
// ------------------------------------------------------------
// Name of the remote (as passed into NewFs)
func (f *Fs) Name() string {
return f.name
@@ -378,6 +439,10 @@ func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, e
ci: ci,
pacer: fs.NewPacer(ctx, pacer.NewDefault(pacer.MinSleep(minSleep), pacer.MaxSleep(maxSleep), pacer.DecayConstant(decayConstant))),
}
f.batcher, err = newBatcher(ctx, f, f.opt.BatchMode, f.opt.BatchSize, time.Duration(f.opt.BatchTimeout))
if err != nil {
return nil, err
}
cfg := dropbox.Config{
LogLevel: dropbox.LogOff, // logging in the SDK: LogOff, LogDebug, LogInfo
Client: oAuthClient, // maybe???
@@ -1377,6 +1442,13 @@ func (f *Fs) Hashes() hash.Set {
return hash.Set(DbHashType)
}
// Shutdown the backend, closing any background tasks and any
// cached connections.
func (f *Fs) Shutdown(ctx context.Context) error {
f.batcher.Shutdown()
return nil
}
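Shutdown is what ties the batcher's lifetime to rclone's: it flushes any
outstanding batch, which is the possible delay on quit the help texts warn
about. A caller-side sketch, assuming f holds an fs.Fs from this backend and
ctx is a context.Context - rclone discovers optional interfaces such as
fs.Shutdowner by type assertion:

// Flush the dropbox batcher before exit (sketch, not the actual call site).
if do, ok := f.(fs.Shutdowner); ok {
	if err := do.Shutdown(ctx); err != nil {
		fs.Errorf(f, "Shutdown failed: %v", err)
	}
}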
// ------------------------------------------------------------
// Fs returns the parent Fs
@@ -1540,9 +1612,10 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read
// unknown (i.e. -1) or smaller than uploadChunkSize, the method incurs an
// avoidable request to the Dropbox API that does not carry payload.
func (o *Object) uploadChunked(ctx context.Context, in0 io.Reader, commitInfo *files.CommitInfo, size int64) (entry *files.FileMetadata, err error) {
batching := o.fs.batcher.Batching()
chunkSize := int64(o.fs.opt.ChunkSize)
chunks := 0
if size != -1 {
if size >= 0 {
chunks = int(size/chunkSize) + 1
}
in := readers.NewCountingReader(in0)
@@ -1553,11 +1626,15 @@ func (o *Object) uploadChunked(ctx context.Context, in0 io.Reader, commitInfo *f
fs.Debugf(o, "Streaming chunk %d/%d", cur, cur)
} else if chunks == 0 {
fs.Debugf(o, "Streaming chunk %d/unknown", cur)
} else {
} else if chunks != 1 {
fs.Debugf(o, "Uploading chunk %d/%d", cur, chunks)
}
}
appendArg := files.UploadSessionAppendArg{
Close: chunks == 1,
}
// write the first chunk
fmtChunk(1, false)
var res *files.UploadSessionStartResult
@@ -1567,7 +1644,10 @@ func (o *Object) uploadChunked(ctx context.Context, in0 io.Reader, commitInfo *f
if _, err = chunk.Seek(0, io.SeekStart); err != nil {
return false, nil
}
res, err = o.fs.srv.UploadSessionStart(&files.UploadSessionStartArg{}, chunk)
arg := files.UploadSessionStartArg{
Close: appendArg.Close,
}
res, err = o.fs.srv.UploadSessionStart(&arg, chunk)
return shouldRetry(ctx, err)
})
if err != nil {
@@ -1578,22 +1658,34 @@ func (o *Object) uploadChunked(ctx context.Context, in0 io.Reader, commitInfo *f
SessionId: res.SessionId,
Offset: 0,
}
appendArg := files.UploadSessionAppendArg{
Cursor: &cursor,
Close: false,
}
appendArg.Cursor = &cursor
// write more whole chunks (if any)
// write more whole chunks (if any, and if !batching); if
// batching, write the last chunk as well.
currentChunk := 2
for {
if chunks > 0 && currentChunk >= chunks {
// if the size is known, only upload full chunks. Remaining bytes are uploaded with
// the UploadSessionFinish request.
break
} else if chunks == 0 && in.BytesRead()-cursor.Offset < uint64(chunkSize) {
// if the size is unknown, upload as long as we can read full chunks from the reader.
// The UploadSessionFinish request will not contain any payload.
break
if chunks > 0 {
// Size known
if currentChunk == chunks {
// Last chunk
if !batching {
// if the size is known, only upload full chunks. Remaining bytes are uploaded with
// the UploadSessionFinish request.
break
}
appendArg.Close = true
} else if currentChunk > chunks {
break
}
} else {
// Size unknown
lastReadWasShort := in.BytesRead()-cursor.Offset < uint64(chunkSize)
if lastReadWasShort {
// if the size is unknown, upload as long as we can read full chunks from the reader.
// The UploadSessionFinish request will not contain any payload.
// This is also what we want if batching
break
}
}
cursor.Offset = in.BytesRead()
fmtChunk(currentChunk, false)
@@ -1619,6 +1711,26 @@ func (o *Object) uploadChunked(ctx context.Context, in0 io.Reader, commitInfo *f
Cursor: &cursor,
Commit: commitInfo,
}
// If we are batching then we should have written all the data now,
// so store the commit info for a batch commit
if batching {
// If we haven't closed the session then we need to
if !appendArg.Close {
appendArg.Close = true
fs.Debugf(o, "Closing session")
var empty bytes.Buffer
err = o.fs.pacer.Call(func() (bool, error) {
err = o.fs.srv.UploadSessionAppendV2(&appendArg, &empty)
// after the first chunk is uploaded, we retry everything
return err != nil, err
})
if err != nil {
return nil, err
}
}
return o.fs.batcher.Commit(ctx, args)
}
fmtChunk(currentChunk, true)
chunk = readers.NewRepeatableReaderBuffer(in, buf)
err = o.fs.pacer.Call(func() (bool, error) {
@@ -1693,7 +1805,7 @@ func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, op
size := src.Size()
var err error
var entry *files.FileMetadata
if size > int64(o.fs.opt.ChunkSize) || size == -1 {
if size > int64(o.fs.opt.ChunkSize) || size < 0 || o.fs.batcher.Batching() {
entry, err = o.uploadChunked(ctx, in, commitInfo, size)
} else {
err = o.fs.pacer.CallNoRetry(func() (bool, error) {
@@ -1704,6 +1816,15 @@ func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, op
if err != nil {
return errors.Wrap(err, "upload failed")
}
// If we haven't received data back from the batch upload then fake it
//
// This will only happen if we are uploading async batches
if entry == nil {
o.bytes = size
o.modTime = commitInfo.ClientModified
o.hash = "" // we don't have this
return nil
}
return o.setMetadataFromEntry(entry)
}
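The chunk arithmetic that uploadChunked and the batching branch rely on is
easiest to see with concrete numbers. A self-contained example - the 8 MiB
chunk size is assumed for illustration, not taken from this diff:

package main

import "fmt"

func main() {
	const chunkSize = int64(8 << 20) // assume an 8 MiB chunk_size
	for _, size := range []int64{20 << 20, 16 << 20} {
		chunks := int(size/chunkSize) + 1
		fmt.Printf("size=%d MiB -> chunks=%d\n", size>>20, chunks)
	}
	// size=20 MiB -> chunks=3: two full 8 MiB chunks plus a 4 MiB
	// remainder. Without batching the loop stops before the last chunk
	// and the remainder travels in the UploadSessionFinish request;
	// with batching the loop uploads the remainder too, with Close set,
	// so the commit can be deferred to a batch.
	// size=16 MiB -> chunks=3: the "remainder" is empty, so the final
	// request carries no payload.
}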
@@ -1731,6 +1852,7 @@ var (
_ fs.PublicLinker = (*Fs)(nil)
_ fs.DirMover = (*Fs)(nil)
_ fs.Abouter = (*Fs)(nil)
_ fs.Shutdowner = &Fs{}
_ fs.Object = (*Object)(nil)
_ fs.IDer = (*Object)(nil)
)
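In day-to-day use the new options surface as command-line flags in rclone's
usual backend-option form. An illustrative invocation for the many-small-files
case the commit message targets - paths and remote name are placeholders, and
only --dropbox-batch-mode is named in the commit title; the other spellings
follow rclone's standard option-to-flag mapping:

rclone copy /path/to/small/files dropbox:dest --transfers 32 --dropbox-batch-mode sync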