forked from TrueCloudLab/rclone
bisync: Add experimental --resilient mode to allow recovery from self-correctable errors
https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=2.%20Bisync%20should%20be%20more%20resilient%20to%20self%2Dcorrectable%20errors
This commit is contained in:
parent
f01a50eb47
commit
e5bde42303
5 changed files with 54 additions and 7 deletions
|
@ -40,6 +40,7 @@ type Options struct {
|
|||
NoCleanup bool
|
||||
SaveQueues bool // save extra debugging files (test only flag)
|
||||
IgnoreListingChecksum bool
|
||||
Resilient bool
|
||||
}
|
||||
|
||||
// Default values
|
||||
|
@ -110,6 +111,7 @@ func init() {
|
|||
flags.BoolVarP(cmdFlags, &tzLocal, "localtime", "", tzLocal, "Use local time in listings (default: UTC)", "")
|
||||
flags.BoolVarP(cmdFlags, &Opt.NoCleanup, "no-cleanup", "", Opt.NoCleanup, "Retain working files (useful for troubleshooting and testing).", "")
|
||||
flags.BoolVarP(cmdFlags, &Opt.IgnoreListingChecksum, "ignore-listing-checksum", "", Opt.IgnoreListingChecksum, "Do not use checksums for listings (add --ignore-checksum to additionally skip post-copy checksum checks)", "")
|
||||
flags.BoolVarP(cmdFlags, &Opt.Resilient, "resilient", "", Opt.Resilient, "Allow future runs to retry after certain less-serious errors, instead of requiring --resync. Use at your own risk!", "")
|
||||
}
|
||||
|
||||
// bisync command definition
|
||||
|
|
|
@ -104,6 +104,7 @@ func (b *bisyncRun) checkconflicts(ctxCheck context.Context, filterCheck *filter
|
|||
opt, close, checkopterr := check.GetCheckOpt(b.fs1, b.fs2)
|
||||
if checkopterr != nil {
|
||||
b.critical = true
|
||||
b.retryable = true
|
||||
fs.Debugf(nil, "GetCheckOpt error: %v", checkopterr)
|
||||
return matches, checkopterr
|
||||
}
|
||||
|
|
|
@ -301,5 +301,6 @@ func (b *bisyncRun) checkListing(ls *fileList, listing, msg string) error {
|
|||
}
|
||||
fs.Errorf(nil, "Empty %s listing. Cannot sync to an empty directory: %s", msg, listing)
|
||||
b.critical = true
|
||||
b.retryable = true
|
||||
return fmt.Errorf("empty %s listing: %s", msg, listing)
|
||||
}
|
||||
|
|
|
@ -25,13 +25,14 @@ var ErrBisyncAborted = errors.New("bisync aborted")
|
|||
|
||||
// bisyncRun keeps bisync runtime state
|
||||
type bisyncRun struct {
|
||||
fs1 fs.Fs
|
||||
fs2 fs.Fs
|
||||
abort bool
|
||||
critical bool
|
||||
basePath string
|
||||
workDir string
|
||||
opt *Options
|
||||
fs1 fs.Fs
|
||||
fs2 fs.Fs
|
||||
abort bool
|
||||
critical bool
|
||||
retryable bool
|
||||
basePath string
|
||||
workDir string
|
||||
opt *Options
|
||||
}
|
||||
|
||||
// Bisync handles lock file, performs bisync run and checks exit status
|
||||
|
@ -123,6 +124,10 @@ func Bisync(ctx context.Context, fs1, fs2 fs.Fs, optArg *Options) (err error) {
|
|||
}
|
||||
|
||||
if b.critical {
|
||||
if b.retryable && b.opt.Resilient {
|
||||
fs.Errorf(nil, "Bisync critical error: %v", err)
|
||||
fs.Errorf(nil, "Bisync aborted. Error is retryable without --resync due to --resilient mode.")
|
||||
} else {
|
||||
if bilib.FileExists(listing1) {
|
||||
_ = os.Rename(listing1, listing1+"-err")
|
||||
}
|
||||
|
@ -131,6 +136,7 @@ func Bisync(ctx context.Context, fs1, fs2 fs.Fs, optArg *Options) (err error) {
|
|||
}
|
||||
fs.Errorf(nil, "Bisync critical error: %v", err)
|
||||
fs.Errorf(nil, "Bisync aborted. Must run --resync to recover.")
|
||||
}
|
||||
return ErrBisyncAborted
|
||||
}
|
||||
if b.abort {
|
||||
|
@ -152,6 +158,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
|
|||
fs.Infof(nil, "Validating listings for Path1 %s vs Path2 %s", quotePath(path1), quotePath(path2))
|
||||
if err = b.checkSync(listing1, listing2); err != nil {
|
||||
b.critical = true
|
||||
b.retryable = true
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
@ -176,6 +183,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
|
|||
var fctx context.Context
|
||||
if fctx, err = b.opt.applyFilters(octx); err != nil {
|
||||
b.critical = true
|
||||
b.retryable = true
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -188,6 +196,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
|
|||
if !bilib.FileExists(listing1) || !bilib.FileExists(listing2) {
|
||||
// On prior critical error abort, the prior listings are renamed to .lst-err to lock out further runs
|
||||
b.critical = true
|
||||
b.retryable = true
|
||||
return errors.New("cannot find prior Path1 or Path2 listings, likely due to critical error on prior run")
|
||||
}
|
||||
|
||||
|
@ -215,6 +224,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
|
|||
err = b.checkAccess(ds1.checkFiles, ds2.checkFiles)
|
||||
if err != nil {
|
||||
b.critical = true
|
||||
b.retryable = true
|
||||
return
|
||||
}
|
||||
}
|
||||
|
@ -255,6 +265,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
|
|||
changes1, changes2, err = b.applyDeltas(octx, ds1, ds2)
|
||||
if err != nil {
|
||||
b.critical = true
|
||||
// b.retryable = true // not sure about this one
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
@ -283,6 +294,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
|
|||
}
|
||||
if err != nil {
|
||||
b.critical = true
|
||||
b.retryable = true
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -310,6 +322,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
|
|||
}
|
||||
if err != nil {
|
||||
b.critical = true
|
||||
b.retryable = true
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
@ -369,6 +382,7 @@ func (b *bisyncRun) resync(octx, fctx context.Context, listing1, listing2 string
|
|||
err = b.checkAccess(ds1.checkFiles, ds2.checkFiles)
|
||||
if err != nil {
|
||||
b.critical = true
|
||||
b.retryable = true
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
|
|
@ -97,6 +97,8 @@ Optional Flags:
|
|||
Consider using `--verbose` or `--dry-run` first.
|
||||
--ignore-listing-checksum Do not use checksums for listings
|
||||
(add --ignore-checksum to additionally skip post-copy checksum checks)
|
||||
--resilient Allow future runs to retry after certain less-serious errors,
|
||||
instead of requiring --resync. Use at your own risk!
|
||||
--localtime Use local time in listings (default: UTC)
|
||||
--no-cleanup Retain working files (useful for troubleshooting and testing).
|
||||
--workdir PATH Use custom working directory (useful for testing).
|
||||
|
@ -285,6 +287,30 @@ were generated on a run using `--ignore-listing-checksum`. For a more robust int
|
|||
consider using [`check`](commands/rclone_check/)
|
||||
(or [`cryptcheck`](/commands/rclone_cryptcheck/), if at least one path is a `crypt` remote.)
|
||||
|
||||
#### --resilient
|
||||
|
||||
***Caution: this is an experimental feature. Use at your own risk!***
|
||||
|
||||
By default, most errors or interruptions will cause bisync to abort and
|
||||
require [`--resync`](#resync) to recover. This is a safety feature,
|
||||
to prevent bisync from running again until a user checks things out.
|
||||
However, in some cases, bisync can go too far and enforce a lockout when one isn't actually necessary,
|
||||
like for certain less-serious errors that might resolve themselves on the next run.
|
||||
When `--resilient` is specified, bisync tries its best to recover and self-correct,
|
||||
and only requires `--resync` as a last resort when a human's involvement is absolutely necessary.
|
||||
The intended use case is for running bisync as a background process (such as via scheduled [cron](#cron)).
|
||||
|
||||
When using `--resilient` mode, bisync will still report the error and abort,
|
||||
however it will not lock out future runs -- allowing the possibility of retrying at the next normally scheduled time,
|
||||
without requiring a `--resync` first. Examples of such retryable errors include
|
||||
access test failures, missing listing files, and filter change detections.
|
||||
These safety features will still prevent the *current* run from proceeding --
|
||||
the difference is that if conditions have improved by the time of the *next* run,
|
||||
that next run will be allowed to proceed.
|
||||
Certain more serious errors will still enforce a `--resync` lockout, even in `--resilient` mode, to prevent data loss.
|
||||
|
||||
Behavior of `--resilient` may change in a future version.
|
||||
|
||||
## Operation
|
||||
|
||||
### Runtime flow details
|
||||
|
@ -394,6 +420,8 @@ typically at `${HOME}/.cache/rclone/bisync/` on Linux.
|
|||
Some errors are considered temporary and re-running the bisync is not blocked.
|
||||
The _critical return_ blocks further bisync runs.
|
||||
|
||||
See also: [`--resilient`](#resilient)
|
||||
|
||||
### Lock file
|
||||
|
||||
When bisync is running, a lock file is created in the bisync working directory,
|
||||
|
@ -1164,5 +1192,6 @@ causing dry runs to inadvertently commit filter changes
|
|||
causing bisync to consider more files than necessary due to overbroad filters during delete operations
|
||||
* [Improved detection of false positive change conflicts](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=1.%20Identical%20files%20should%20be%20left%20alone%2C%20even%20if%20new/newer/changed%20on%20both%20sides)
|
||||
(identical files are now left alone instead of renamed)
|
||||
* Added experimental `--resilient` mode to allow [recovery from self-correctable errors](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=2.%20Bisync%20should%20be%20more%20resilient%20to%20self%2Dcorrectable%20errors)
|
||||
* Added [new `--ignore-listing-checksum` flag](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=6.%20%2D%2Dignore%2Dchecksum%20should%20be%20split%20into%20two%20flags%20for%20separate%20purposes)
|
||||
to distinguish from `--ignore-checksum`
|
||||
|
|
Loading…
Reference in a new issue