bisync: Add experimental --resilient mode to allow recovery from self-correctable errors

https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=2.%20Bisync%20should%20be%20more%20resilient%20to%20self%2Dcorrectable%20errors
This commit is contained in:
nielash 2023-07-11 06:57:49 -04:00 committed by Nick Craig-Wood
parent f01a50eb47
commit e5bde42303
5 changed files with 54 additions and 7 deletions

View file

@ -40,6 +40,7 @@ type Options struct {
NoCleanup bool NoCleanup bool
SaveQueues bool // save extra debugging files (test only flag) SaveQueues bool // save extra debugging files (test only flag)
IgnoreListingChecksum bool IgnoreListingChecksum bool
Resilient bool
} }
// Default values // Default values
@ -110,6 +111,7 @@ func init() {
flags.BoolVarP(cmdFlags, &tzLocal, "localtime", "", tzLocal, "Use local time in listings (default: UTC)", "") flags.BoolVarP(cmdFlags, &tzLocal, "localtime", "", tzLocal, "Use local time in listings (default: UTC)", "")
flags.BoolVarP(cmdFlags, &Opt.NoCleanup, "no-cleanup", "", Opt.NoCleanup, "Retain working files (useful for troubleshooting and testing).", "") flags.BoolVarP(cmdFlags, &Opt.NoCleanup, "no-cleanup", "", Opt.NoCleanup, "Retain working files (useful for troubleshooting and testing).", "")
flags.BoolVarP(cmdFlags, &Opt.IgnoreListingChecksum, "ignore-listing-checksum", "", Opt.IgnoreListingChecksum, "Do not use checksums for listings (add --ignore-checksum to additionally skip post-copy checksum checks)", "") flags.BoolVarP(cmdFlags, &Opt.IgnoreListingChecksum, "ignore-listing-checksum", "", Opt.IgnoreListingChecksum, "Do not use checksums for listings (add --ignore-checksum to additionally skip post-copy checksum checks)", "")
flags.BoolVarP(cmdFlags, &Opt.Resilient, "resilient", "", Opt.Resilient, "Allow future runs to retry after certain less-serious errors, instead of requiring --resync. Use at your own risk!", "")
} }
// bisync command definition // bisync command definition

View file

@ -104,6 +104,7 @@ func (b *bisyncRun) checkconflicts(ctxCheck context.Context, filterCheck *filter
opt, close, checkopterr := check.GetCheckOpt(b.fs1, b.fs2) opt, close, checkopterr := check.GetCheckOpt(b.fs1, b.fs2)
if checkopterr != nil { if checkopterr != nil {
b.critical = true b.critical = true
b.retryable = true
fs.Debugf(nil, "GetCheckOpt error: %v", checkopterr) fs.Debugf(nil, "GetCheckOpt error: %v", checkopterr)
return matches, checkopterr return matches, checkopterr
} }

View file

@ -301,5 +301,6 @@ func (b *bisyncRun) checkListing(ls *fileList, listing, msg string) error {
} }
fs.Errorf(nil, "Empty %s listing. Cannot sync to an empty directory: %s", msg, listing) fs.Errorf(nil, "Empty %s listing. Cannot sync to an empty directory: %s", msg, listing)
b.critical = true b.critical = true
b.retryable = true
return fmt.Errorf("empty %s listing: %s", msg, listing) return fmt.Errorf("empty %s listing: %s", msg, listing)
} }

View file

@ -25,13 +25,14 @@ var ErrBisyncAborted = errors.New("bisync aborted")
// bisyncRun keeps bisync runtime state // bisyncRun keeps bisync runtime state
type bisyncRun struct { type bisyncRun struct {
fs1 fs.Fs fs1 fs.Fs
fs2 fs.Fs fs2 fs.Fs
abort bool abort bool
critical bool critical bool
basePath string retryable bool
workDir string basePath string
opt *Options workDir string
opt *Options
} }
// Bisync handles lock file, performs bisync run and checks exit status // Bisync handles lock file, performs bisync run and checks exit status
@ -123,6 +124,10 @@ func Bisync(ctx context.Context, fs1, fs2 fs.Fs, optArg *Options) (err error) {
} }
if b.critical { if b.critical {
if b.retryable && b.opt.Resilient {
fs.Errorf(nil, "Bisync critical error: %v", err)
fs.Errorf(nil, "Bisync aborted. Error is retryable without --resync due to --resilient mode.")
} else {
if bilib.FileExists(listing1) { if bilib.FileExists(listing1) {
_ = os.Rename(listing1, listing1+"-err") _ = os.Rename(listing1, listing1+"-err")
} }
@ -131,6 +136,7 @@ func Bisync(ctx context.Context, fs1, fs2 fs.Fs, optArg *Options) (err error) {
} }
fs.Errorf(nil, "Bisync critical error: %v", err) fs.Errorf(nil, "Bisync critical error: %v", err)
fs.Errorf(nil, "Bisync aborted. Must run --resync to recover.") fs.Errorf(nil, "Bisync aborted. Must run --resync to recover.")
}
return ErrBisyncAborted return ErrBisyncAborted
} }
if b.abort { if b.abort {
@ -152,6 +158,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
fs.Infof(nil, "Validating listings for Path1 %s vs Path2 %s", quotePath(path1), quotePath(path2)) fs.Infof(nil, "Validating listings for Path1 %s vs Path2 %s", quotePath(path1), quotePath(path2))
if err = b.checkSync(listing1, listing2); err != nil { if err = b.checkSync(listing1, listing2); err != nil {
b.critical = true b.critical = true
b.retryable = true
} }
return err return err
} }
@ -176,6 +183,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
var fctx context.Context var fctx context.Context
if fctx, err = b.opt.applyFilters(octx); err != nil { if fctx, err = b.opt.applyFilters(octx); err != nil {
b.critical = true b.critical = true
b.retryable = true
return return
} }
@ -188,6 +196,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
if !bilib.FileExists(listing1) || !bilib.FileExists(listing2) { if !bilib.FileExists(listing1) || !bilib.FileExists(listing2) {
// On prior critical error abort, the prior listings are renamed to .lst-err to lock out further runs // On prior critical error abort, the prior listings are renamed to .lst-err to lock out further runs
b.critical = true b.critical = true
b.retryable = true
return errors.New("cannot find prior Path1 or Path2 listings, likely due to critical error on prior run") return errors.New("cannot find prior Path1 or Path2 listings, likely due to critical error on prior run")
} }
@ -215,6 +224,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
err = b.checkAccess(ds1.checkFiles, ds2.checkFiles) err = b.checkAccess(ds1.checkFiles, ds2.checkFiles)
if err != nil { if err != nil {
b.critical = true b.critical = true
b.retryable = true
return return
} }
} }
@ -255,6 +265,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
changes1, changes2, err = b.applyDeltas(octx, ds1, ds2) changes1, changes2, err = b.applyDeltas(octx, ds1, ds2)
if err != nil { if err != nil {
b.critical = true b.critical = true
// b.retryable = true // not sure about this one
return err return err
} }
} }
@ -283,6 +294,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
} }
if err != nil { if err != nil {
b.critical = true b.critical = true
b.retryable = true
return err return err
} }
@ -310,6 +322,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
} }
if err != nil { if err != nil {
b.critical = true b.critical = true
b.retryable = true
return err return err
} }
} }
@ -369,6 +382,7 @@ func (b *bisyncRun) resync(octx, fctx context.Context, listing1, listing2 string
err = b.checkAccess(ds1.checkFiles, ds2.checkFiles) err = b.checkAccess(ds1.checkFiles, ds2.checkFiles)
if err != nil { if err != nil {
b.critical = true b.critical = true
b.retryable = true
return err return err
} }
} }

View file

@ -97,6 +97,8 @@ Optional Flags:
Consider using `--verbose` or `--dry-run` first. Consider using `--verbose` or `--dry-run` first.
--ignore-listing-checksum Do not use checksums for listings --ignore-listing-checksum Do not use checksums for listings
(add --ignore-checksum to additionally skip post-copy checksum checks) (add --ignore-checksum to additionally skip post-copy checksum checks)
--resilient Allow future runs to retry after certain less-serious errors,
instead of requiring --resync. Use at your own risk!
--localtime Use local time in listings (default: UTC) --localtime Use local time in listings (default: UTC)
--no-cleanup Retain working files (useful for troubleshooting and testing). --no-cleanup Retain working files (useful for troubleshooting and testing).
--workdir PATH Use custom working directory (useful for testing). --workdir PATH Use custom working directory (useful for testing).
@ -285,6 +287,30 @@ were generated on a run using `--ignore-listing-checksum`. For a more robust int
consider using [`check`](commands/rclone_check/) consider using [`check`](commands/rclone_check/)
(or [`cryptcheck`](/commands/rclone_cryptcheck/), if at least one path is a `crypt` remote.) (or [`cryptcheck`](/commands/rclone_cryptcheck/), if at least one path is a `crypt` remote.)
#### --resilient
***Caution: this is an experimental feature. Use at your own risk!***
By default, most errors or interruptions will cause bisync to abort and
require [`--resync`](#resync) to recover. This is a safety feature,
to prevent bisync from running again until a user checks things out.
However, in some cases, bisync can go too far and enforce a lockout when one isn't actually necessary,
like for certain less-serious errors that might resolve themselves on the next run.
When `--resilient` is specified, bisync tries its best to recover and self-correct,
and only requires `--resync` as a last resort when a human's involvement is absolutely necessary.
The intended use case is for running bisync as a background process (such as via scheduled [cron](#cron)).
When using `--resilient` mode, bisync will still report the error and abort,
however it will not lock out future runs -- allowing the possibility of retrying at the next normally scheduled time,
without requiring a `--resync` first. Examples of such retryable errors include
access test failures, missing listing files, and filter change detections.
These safety features will still prevent the *current* run from proceeding --
the difference is that if conditions have improved by the time of the *next* run,
that next run will be allowed to proceed.
Certain more serious errors will still enforce a `--resync` lockout, even in `--resilient` mode, to prevent data loss.
Behavior of `--resilient` may change in a future version.
## Operation ## Operation
### Runtime flow details ### Runtime flow details
@ -394,6 +420,8 @@ typically at `${HOME}/.cache/rclone/bisync/` on Linux.
Some errors are considered temporary and re-running the bisync is not blocked. Some errors are considered temporary and re-running the bisync is not blocked.
The _critical return_ blocks further bisync runs. The _critical return_ blocks further bisync runs.
See also: [`--resilient`](#resilient)
### Lock file ### Lock file
When bisync is running, a lock file is created in the bisync working directory, When bisync is running, a lock file is created in the bisync working directory,
@ -1164,5 +1192,6 @@ causing dry runs to inadvertently commit filter changes
causing bisync to consider more files than necessary due to overbroad filters during delete operations causing bisync to consider more files than necessary due to overbroad filters during delete operations
* [Improved detection of false positive change conflicts](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=1.%20Identical%20files%20should%20be%20left%20alone%2C%20even%20if%20new/newer/changed%20on%20both%20sides) * [Improved detection of false positive change conflicts](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=1.%20Identical%20files%20should%20be%20left%20alone%2C%20even%20if%20new/newer/changed%20on%20both%20sides)
(identical files are now left alone instead of renamed) (identical files are now left alone instead of renamed)
* Added experimental `--resilient` mode to allow [recovery from self-correctable errors](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=2.%20Bisync%20should%20be%20more%20resilient%20to%20self%2Dcorrectable%20errors)
* Added [new `--ignore-listing-checksum` flag](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=6.%20%2D%2Dignore%2Dchecksum%20should%20be%20split%20into%20two%20flags%20for%20separate%20purposes) * Added [new `--ignore-listing-checksum` flag](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=6.%20%2D%2Dignore%2Dchecksum%20should%20be%20split%20into%20two%20flags%20for%20separate%20purposes)
to distinguish from `--ignore-checksum` to distinguish from `--ignore-checksum`