Merge pull request #4784 from MichaelEischer/rework-backend-retries

Rework backend retries

commit 16ef4d515b

7 changed files with 198 additions and 49 deletions
@@ -9,7 +9,9 @@ downloading are now forcibly interrupted. This ensures that stuck requests are
 retried after a short timeout.
 
 Attempts to access a missing file or a truncated file will no longer be retried.
-This avoids unnecessary retries in those cases.
+This avoids unnecessary retries in those cases. All other backend requests are
+retried for up to 15 minutes. This ensures that temporarily interrupted network
+connections can be tolerated.
 
 If a download yields a corrupt file or blob, then the download will be retried once.
 
@@ -26,3 +28,4 @@ https://github.com/restic/restic/issues/4515
 https://github.com/restic/restic/issues/1523
 https://github.com/restic/restic/pull/4520
 https://github.com/restic/restic/pull/4800
+https://github.com/restic/restic/pull/4784
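For context: the policy described in the changelog entry above is built on the github.com/cenkalti/backoff/v4 package that the retry backend wraps. A minimal standalone sketch of the idea, not part of this commit; loadFromBackend and its error value are hypothetical stand-ins:

package main

import (
	"errors"
	"fmt"
	"os"
	"time"

	"github.com/cenkalti/backoff/v4"
)

// loadFromBackend stands in for a single backend request; here it always
// reports a missing file.
func loadFromBackend() error {
	return os.ErrNotExist
}

func main() {
	bo := backoff.NewExponentialBackOff()
	bo.MaxElapsedTime = 15 * time.Minute // overall retry budget instead of a fixed retry count

	err := backoff.Retry(func() error {
		err := loadFromBackend()
		if errors.Is(err, os.ErrNotExist) {
			// Missing (or truncated) files are not worth retrying: marking the
			// error permanent makes the backoff loop give up immediately.
			return backoff.Permanent(err)
		}
		return err
	}, bo)
	fmt.Println("result:", err)
}

Marking such errors permanent is what keeps the "no retries for missing files" behavior cheap: the loop exits after the first attempt instead of burning through the 15-minute budget.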
@@ -416,12 +416,16 @@ func OpenRepository(ctx context.Context, opts GlobalOptions) (*repository.Reposi
 	}
 
 	report := func(msg string, err error, d time.Duration) {
-		Warnf("%v returned error, retrying after %v: %v\n", msg, d, err)
+		if d >= 0 {
+			Warnf("%v returned error, retrying after %v: %v\n", msg, d, err)
+		} else {
+			Warnf("%v failed: %v\n", msg, err)
+		}
 	}
 	success := func(msg string, retries int) {
 		Warnf("%v operation successful after %d retries\n", msg, retries)
 	}
-	be = retry.New(be, 10, report, success)
+	be = retry.New(be, 15*time.Minute, report, success)
 
 	// wrap backend if a test specified a hook
 	if opts.backendTestHook != nil {
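The new if d >= 0 branch exists because the retry backend now reports the final, no-longer-retried error with a negative delay (see the notify(err, -1) call added further down in this diff). A standalone sketch of that callback convention; the message and error text are made up:

package main

import (
	"errors"
	"fmt"
	"time"
)

func main() {
	// Same convention as the report callback above: d >= 0 means the request
	// failed and will be retried after d, a negative d marks the final failure.
	report := func(msg string, err error, d time.Duration) {
		if d >= 0 {
			fmt.Printf("%v returned error, retrying after %v: %v\n", msg, d, err)
		} else {
			fmt.Printf("%v failed: %v\n", msg, err)
		}
	}

	report("Load(<data/1234>)", errors.New("connection reset"), 2*time.Second) // still retrying
	report("Load(<data/1234>)", errors.New("connection reset"), -1)            // gave up
}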
@@ -18,9 +18,9 @@ import (
 // backoff.
 type Backend struct {
 	backend.Backend
-	MaxTries int
-	Report   func(string, error, time.Duration)
-	Success  func(string, int)
+	MaxElapsedTime time.Duration
+	Report         func(string, error, time.Duration)
+	Success        func(string, int)
 
 	failedLoads sync.Map
 }
@@ -32,32 +32,64 @@ var _ backend.Backend = &Backend{}
 // backoff. report is called with a description and the error, if one occurred.
 // success is called with the number of retries before a successful operation
 // (it is not called if it succeeded on the first try)
-func New(be backend.Backend, maxTries int, report func(string, error, time.Duration), success func(string, int)) *Backend {
+func New(be backend.Backend, maxElapsedTime time.Duration, report func(string, error, time.Duration), success func(string, int)) *Backend {
 	return &Backend{
-		Backend:  be,
-		MaxTries: maxTries,
-		Report:   report,
-		Success:  success,
+		Backend:        be,
+		MaxElapsedTime: maxElapsedTime,
+		Report:         report,
+		Success:        success,
 	}
 }
 
 // retryNotifyErrorWithSuccess is an extension of backoff.RetryNotify with notification of success after an error.
 // success is NOT notified on the first run of operation (only after an error).
 func retryNotifyErrorWithSuccess(operation backoff.Operation, b backoff.BackOff, notify backoff.Notify, success func(retries int)) error {
+	var operationWrapper backoff.Operation
 	if success == nil {
-		return backoff.RetryNotify(operation, b, notify)
-	}
-	retries := 0
-	operationWrapper := func() error {
-		err := operation()
-		if err != nil {
-			retries++
-		} else if retries > 0 {
-			success(retries)
+		operationWrapper = operation
+	} else {
+		retries := 0
+		operationWrapper = func() error {
+			err := operation()
+			if err != nil {
+				retries++
+			} else if retries > 0 {
+				success(retries)
+			}
+			return err
 		}
-		return err
 	}
-	return backoff.RetryNotify(operationWrapper, b, notify)
+	err := backoff.RetryNotify(operationWrapper, b, notify)
+
+	if err != nil && notify != nil {
+		// log final error
+		notify(err, -1)
+	}
+	return err
 }
 
+func withRetryAtLeastOnce(delegate *backoff.ExponentialBackOff) *retryAtLeastOnce {
+	return &retryAtLeastOnce{delegate: delegate}
+}
+
+type retryAtLeastOnce struct {
+	delegate *backoff.ExponentialBackOff
+	numTries uint64
+}
+
+func (b *retryAtLeastOnce) NextBackOff() time.Duration {
+	delay := b.delegate.NextBackOff()
+
+	b.numTries++
+	if b.numTries == 1 && b.delegate.Stop == delay {
+		return b.delegate.InitialInterval
+	}
+	return delay
+}
+
+func (b *retryAtLeastOnce) Reset() {
+	b.numTries = 0
+	b.delegate.Reset()
+}
+
 var fastRetries = false
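The retryAtLeastOnce wrapper added above guarantees one retry even when the exponential backoff has already exhausted its MaxElapsedTime budget, for example after a long, slow download. A standalone sketch of that behavior; retryAtLeastOnce is copied from this diff, the surrounding program is illustrative only:

package main

import (
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
)

// retryAtLeastOnce is copied from the diff above.
type retryAtLeastOnce struct {
	delegate *backoff.ExponentialBackOff
	numTries uint64
}

func (b *retryAtLeastOnce) NextBackOff() time.Duration {
	delay := b.delegate.NextBackOff()

	b.numTries++
	if b.numTries == 1 && b.delegate.Stop == delay {
		return b.delegate.InitialInterval
	}
	return delay
}

func (b *retryAtLeastOnce) Reset() {
	b.numTries = 0
	b.delegate.Reset()
}

func main() {
	bo := backoff.NewExponentialBackOff()
	bo.InitialInterval = 500 * time.Millisecond
	bo.MaxElapsedTime = time.Nanosecond // pretend the retry budget is already used up
	bo.Reset()
	time.Sleep(time.Millisecond)

	// A bare *ExponentialBackOff gives up right away once MaxElapsedTime is exceeded.
	fmt.Println(bo.NextBackOff() == backoff.Stop) // true

	bo.Reset()
	time.Sleep(time.Millisecond)
	wrapped := &retryAtLeastOnce{delegate: bo}
	fmt.Println(wrapped.NextBackOff())                 // 500ms: one retry is still granted
	fmt.Println(wrapped.NextBackOff() == backoff.Stop) // true: but only one
}

Without the wrapper, a request whose first attempt outlives the budget would never be retried at all.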
@@ -74,9 +106,25 @@ func (be *Backend) retry(ctx context.Context, msg string, f func() error) error
 	}
 
 	bo := backoff.NewExponentialBackOff()
+	bo.MaxElapsedTime = be.MaxElapsedTime
+
+	if feature.Flag.Enabled(feature.BackendErrorRedesign) {
+		bo.InitialInterval = 1 * time.Second
+		bo.Multiplier = 2
+	}
 	if fastRetries {
 		// speed up integration tests
 		bo.InitialInterval = 1 * time.Millisecond
+		maxElapsedTime := 200 * time.Millisecond
+		if bo.MaxElapsedTime > maxElapsedTime {
+			bo.MaxElapsedTime = maxElapsedTime
+		}
 	}
 
+	var b backoff.BackOff = withRetryAtLeastOnce(bo)
+	if !feature.Flag.Enabled(feature.BackendErrorRedesign) {
+		// deprecated behavior
+		b = backoff.WithMaxRetries(b, 10)
+	}
+
 	err := retryNotifyErrorWithSuccess(
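Assuming the BackendErrorRedesign feature flag is enabled, the settings above (InitialInterval of one second, Multiplier of 2, plus the 15-minute MaxElapsedTime configured earlier) produce roughly a 1s, 2s, 4s, ... schedule capped at the backoff's MaxInterval. A standalone sketch that prints the schedule; jitter is disabled here for deterministic output, while the real code keeps the default randomization:

package main

import (
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
)

func main() {
	bo := backoff.NewExponentialBackOff()
	bo.InitialInterval = 1 * time.Second
	bo.Multiplier = 2
	bo.MaxElapsedTime = 15 * time.Minute
	bo.RandomizationFactor = 0 // disable jitter so the printed schedule is deterministic
	bo.Reset()

	var total time.Duration
	for i := 1; i <= 10; i++ {
		d := bo.NextBackOff()
		if d == backoff.Stop {
			break
		}
		total += d
		fmt.Printf("retry %2d after %v (sum of delays %v)\n", i, d, total)
	}
	// Prints 1s, 2s, 4s, ..., capped at bo.MaxInterval (60s by default). In real
	// use the retries stop once the 15-minute MaxElapsedTime has elapsed.
}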
@@ -89,7 +137,7 @@ func (be *Backend) retry(ctx context.Context, msg string, f func() error) error
 			}
 			return err
 		},
-		backoff.WithContext(backoff.WithMaxRetries(bo, uint64(be.MaxTries)), ctx),
+		backoff.WithContext(b, ctx),
 		func(err error, d time.Duration) {
 			if be.Report != nil {
 				be.Report(msg, err, d)
@@ -193,8 +193,9 @@ func TestBackendListRetryErrorBackend(t *testing.T) {
 	}
 
 	TestFastRetries(t)
-	const maxRetries = 2
-	retryBackend := New(be, maxRetries, nil, nil)
+	const maxElapsedTime = 10 * time.Millisecond
+	now := time.Now()
+	retryBackend := New(be, maxElapsedTime, nil, nil)
 
 	var listed []string
 	err := retryBackend.List(context.TODO(), backend.PackFile, func(fi backend.FileInfo) error {
@@ -207,8 +208,9 @@ func TestBackendListRetryErrorBackend(t *testing.T) {
 		t.Fatalf("wrong error returned, want %v, got %v", ErrBackendTest, err)
 	}
 
-	if retries != maxRetries+1 {
-		t.Fatalf("List was called %d times, wanted %v", retries, maxRetries+1)
+	duration := time.Since(now)
+	if duration > 100*time.Millisecond {
+		t.Fatalf("list retries took %v, expected at most 10ms", duration)
 	}
 
 	test.Equals(t, names[:2], listed)
@@ -327,7 +329,7 @@ func TestBackendLoadCircuitBreaker(t *testing.T) {
 	// trip the circuit breaker for file "other"
 	err := retryBackend.Load(context.TODO(), backend.Handle{Name: "other"}, 0, 0, nilRd)
 	test.Equals(t, otherError, err, "unexpected error")
-	test.Equals(t, 3, attempt)
+	test.Equals(t, 2, attempt)
 
 	attempt = 0
 	err = retryBackend.Load(context.TODO(), backend.Handle{Name: "other"}, 0, 0, nilRd)
@@ -407,7 +409,7 @@ func TestBackendRetryPermanent(t *testing.T) {
 		return errors.New("something")
 	})
 	test.Assert(t, !be.IsPermanentErrorFn(err), "error unexpectedly considered permanent %v", err)
-	test.Equals(t, 3, attempt)
+	test.Equals(t, 2, attempt)
 
 }
 
@@ -497,3 +499,64 @@ func TestNotifyWithSuccessIsCalled(t *testing.T) {
 		t.Fatalf("Success should have been called only once, but was called %d times instead", successCalled)
 	}
 }
+
+func TestNotifyWithSuccessFinalError(t *testing.T) {
+	operation := func() error {
+		return errors.New("expected error in test")
+	}
+
+	notifyCalled := 0
+	notify := func(error, time.Duration) {
+		notifyCalled++
+	}
+
+	successCalled := 0
+	success := func(retries int) {
+		successCalled++
+	}
+
+	err := retryNotifyErrorWithSuccess(operation, backoff.WithMaxRetries(&backoff.ZeroBackOff{}, 5), notify, success)
+	test.Assert(t, err.Error() == "expected error in test", "wrong error message %v", err)
+	test.Equals(t, 6, notifyCalled, "notify should have been called 6 times")
+	test.Equals(t, 0, successCalled, "success should not have been called")
+}
+
+type testClock struct {
+	Time time.Time
+}
+
+func (c *testClock) Now() time.Time {
+	return c.Time
+}
+
+func TestRetryAtLeastOnce(t *testing.T) {
+	expBackOff := backoff.NewExponentialBackOff()
+	expBackOff.InitialInterval = 500 * time.Millisecond
+	expBackOff.RandomizationFactor = 0
+	expBackOff.MaxElapsedTime = 5 * time.Second
+	expBackOff.Multiplier = 2 // guarantee numerical stability
+	clock := &testClock{Time: time.Now()}
+	expBackOff.Clock = clock
+	expBackOff.Reset()

+	retry := withRetryAtLeastOnce(expBackOff)
+
+	// expire backoff
+	clock.Time = clock.Time.Add(10 * time.Second)
+	delay := retry.NextBackOff()
+	test.Equals(t, expBackOff.InitialInterval, delay, "must retry at least once")
+
+	delay = retry.NextBackOff()
+	test.Equals(t, expBackOff.Stop, delay, "must not retry more than once")
+
+	// test reset behavior
+	retry.Reset()
+	test.Equals(t, uint64(0), retry.numTries, "numTries should be reset to 0")
+
+	// Verify that after reset, NextBackOff returns the initial interval again
+	delay = retry.NextBackOff()
+	test.Equals(t, expBackOff.InitialInterval, delay, "retries must work after reset")
+
+	delay = retry.NextBackOff()
+	test.Equals(t, expBackOff.InitialInterval*time.Duration(expBackOff.Multiplier), delay, "retries must work after reset")
+}
@@ -132,7 +132,7 @@ func (l *locker) refreshLocks(ctx context.Context, backend backend.Backend, lock
 
 	// remove the lock from the repo
 	debug.Log("unlocking repository with lock %v", lock)
-	if err := lock.Unlock(); err != nil {
+	if err := lock.Unlock(ctx); err != nil {
 		debug.Log("error while unlocking: %v", err)
 		logger("error while unlocking: %v", err)
 	}
@@ -17,6 +17,10 @@ import (
 	"github.com/restic/restic/internal/debug"
 )
 
+// UnlockCancelDelay bounds the duration how long lock cleanup operations will wait
+// if the passed in context was canceled.
+const UnlockCancelDelay time.Duration = 1 * time.Minute
+
 // Lock represents a process locking the repository for an operation.
 //
 // There are two types of locks: exclusive and non-exclusive. There may be many
@@ -136,7 +140,7 @@ func newLock(ctx context.Context, repo Unpacked, excl bool) (*Lock, error) {
 	time.Sleep(waitBeforeLockCheck)
 
 	if err = lock.checkForOtherLocks(ctx); err != nil {
-		_ = lock.Unlock()
+		_ = lock.Unlock(ctx)
 		return nil, err
 	}
 
@@ -220,12 +224,15 @@ func (l *Lock) createLock(ctx context.Context) (ID, error) {
 }
 
 // Unlock removes the lock from the repository.
-func (l *Lock) Unlock() error {
+func (l *Lock) Unlock(ctx context.Context) error {
 	if l == nil || l.lockID == nil {
 		return nil
 	}
 
-	return l.repo.RemoveUnpacked(context.TODO(), LockFile, *l.lockID)
+	ctx, cancel := delayedCancelContext(ctx, UnlockCancelDelay)
+	defer cancel()
+
+	return l.repo.RemoveUnpacked(ctx, LockFile, *l.lockID)
 }
 
 var StaleLockTimeout = 30 * time.Minute
@@ -266,6 +273,23 @@ func (l *Lock) Stale() bool {
 	return false
 }
 
+func delayedCancelContext(parentCtx context.Context, delay time.Duration) (context.Context, context.CancelFunc) {
+	ctx, cancel := context.WithCancel(context.Background())
+
+	go func() {
+		select {
+		case <-parentCtx.Done():
+		case <-ctx.Done():
+			return
+		}
+
+		time.Sleep(delay)
+		cancel()
+	}()
+
+	return ctx, cancel
+}
+
 // Refresh refreshes the lock by creating a new file in the backend with a new
 // timestamp. Afterwards the old lock is removed.
 func (l *Lock) Refresh(ctx context.Context) error {
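delayedCancelContext is what UnlockCancelDelay feeds into: when the passed-in context is already canceled (for example after Ctrl-C), lock cleanup still gets a bounded grace period instead of being abandoned immediately. A standalone sketch of the behavior; delayedCancelContext is copied from this diff and the timings are shortened for the demo:

package main

import (
	"context"
	"fmt"
	"time"
)

// delayedCancelContext is copied from the diff above: the returned context is only
// canceled `delay` after the parent context is done (or when cancel is called).
func delayedCancelContext(parentCtx context.Context, delay time.Duration) (context.Context, context.CancelFunc) {
	ctx, cancel := context.WithCancel(context.Background())

	go func() {
		select {
		case <-parentCtx.Done():
		case <-ctx.Done():
			return
		}

		time.Sleep(delay)
		cancel()
	}()

	return ctx, cancel
}

func main() {
	parent, cancelParent := context.WithCancel(context.Background())
	ctx, cancel := delayedCancelContext(parent, 100*time.Millisecond)
	defer cancel()

	cancelParent() // the caller gave up ...
	time.Sleep(20 * time.Millisecond)
	fmt.Println("shortly after parent cancel:", ctx.Err()) // <nil>: cleanup can still run

	time.Sleep(150 * time.Millisecond)
	fmt.Println("after the grace period:", ctx.Err()) // context canceled
}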
@@ -285,7 +309,10 @@ func (l *Lock) Refresh(ctx context.Context) error {
 	oldLockID := l.lockID
 	l.lockID = &id
 
-	return l.repo.RemoveUnpacked(context.TODO(), LockFile, *oldLockID)
+	ctx, cancel := delayedCancelContext(ctx, UnlockCancelDelay)
+	defer cancel()
+
+	return l.repo.RemoveUnpacked(ctx, LockFile, *oldLockID)
 }
 
 // RefreshStaleLock is an extended variant of Refresh that can also refresh stale lock files.
@@ -312,15 +339,19 @@ func (l *Lock) RefreshStaleLock(ctx context.Context) error {
 	time.Sleep(waitBeforeLockCheck)
 
 	exists, err = l.checkExistence(ctx)
+
+	ctx, cancel := delayedCancelContext(ctx, UnlockCancelDelay)
+	defer cancel()
+
 	if err != nil {
 		// cleanup replacement lock
-		_ = l.repo.RemoveUnpacked(context.TODO(), LockFile, id)
+		_ = l.repo.RemoveUnpacked(ctx, LockFile, id)
 		return err
 	}
 
 	if !exists {
 		// cleanup replacement lock
-		_ = l.repo.RemoveUnpacked(context.TODO(), LockFile, id)
+		_ = l.repo.RemoveUnpacked(ctx, LockFile, id)
 		return ErrRemovedLock
 	}
 
@@ -331,7 +362,7 @@ func (l *Lock) RefreshStaleLock(ctx context.Context) error {
 	oldLockID := l.lockID
 	l.lockID = &id
 
-	return l.repo.RemoveUnpacked(context.TODO(), LockFile, *oldLockID)
+	return l.repo.RemoveUnpacked(ctx, LockFile, *oldLockID)
 }
 
 func (l *Lock) checkExistence(ctx context.Context) (bool, error) {
@@ -22,7 +22,7 @@ func TestLock(t *testing.T) {
 	lock, err := restic.NewLock(context.TODO(), repo)
 	rtest.OK(t, err)
 
-	rtest.OK(t, lock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
 }
 
 func TestDoubleUnlock(t *testing.T) {
@@ -32,9 +32,9 @@ func TestDoubleUnlock(t *testing.T) {
 	lock, err := restic.NewLock(context.TODO(), repo)
 	rtest.OK(t, err)
 
-	rtest.OK(t, lock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
 
-	err = lock.Unlock()
+	err = lock.Unlock(context.TODO())
 	rtest.Assert(t, err != nil,
 		"double unlock didn't return an error, got %v", err)
 }
@@ -49,8 +49,8 @@ func TestMultipleLock(t *testing.T) {
 	lock2, err := restic.NewLock(context.TODO(), repo)
 	rtest.OK(t, err)
 
-	rtest.OK(t, lock1.Unlock())
-	rtest.OK(t, lock2.Unlock())
+	rtest.OK(t, lock1.Unlock(context.TODO()))
+	rtest.OK(t, lock2.Unlock(context.TODO()))
 }
 
 type failLockLoadingBackend struct {
@@ -75,7 +75,7 @@ func TestMultipleLockFailure(t *testing.T) {
 	_, err = restic.NewLock(context.TODO(), repo)
 	rtest.Assert(t, err != nil, "unreadable lock file did not result in an error")
 
-	rtest.OK(t, lock1.Unlock())
+	rtest.OK(t, lock1.Unlock(context.TODO()))
 }
 
 func TestLockExclusive(t *testing.T) {
@@ -83,7 +83,7 @@ func TestLockExclusive(t *testing.T) {
 
 	elock, err := restic.NewExclusiveLock(context.TODO(), repo)
 	rtest.OK(t, err)
-	rtest.OK(t, elock.Unlock())
+	rtest.OK(t, elock.Unlock(context.TODO()))
 }
 
 func TestLockOnExclusiveLockedRepo(t *testing.T) {
@@ -99,8 +99,8 @@ func TestLockOnExclusiveLockedRepo(t *testing.T) {
 	rtest.Assert(t, restic.IsAlreadyLocked(err),
 		"create normal lock with exclusively locked repo didn't return the correct error")
 
-	rtest.OK(t, lock.Unlock())
-	rtest.OK(t, elock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
+	rtest.OK(t, elock.Unlock(context.TODO()))
 }
 
 func TestExclusiveLockOnLockedRepo(t *testing.T) {
@@ -116,8 +116,8 @@ func TestExclusiveLockOnLockedRepo(t *testing.T) {
 	rtest.Assert(t, restic.IsAlreadyLocked(err),
 		"create normal lock with exclusively locked repo didn't return the correct error")
 
-	rtest.OK(t, lock.Unlock())
-	rtest.OK(t, elock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
+	rtest.OK(t, elock.Unlock(context.TODO()))
 }
 
 func createFakeLock(repo restic.SaverUnpacked, t time.Time, pid int) (restic.ID, error) {
@@ -296,7 +296,7 @@ func testLockRefresh(t *testing.T, refresh func(lock *restic.Lock) error) {
 	rtest.OK(t, err)
 	rtest.Assert(t, lock2.Time.After(time0),
 		"expected a later timestamp after lock refresh")
-	rtest.OK(t, lock.Unlock())
+	rtest.OK(t, lock.Unlock(context.TODO()))
 }
 
 func TestLockRefresh(t *testing.T) {