From 2296fdf668468797b3ca1aa3be8b5dfeba1a9780 Mon Sep 17 00:00:00 2001
From: Michael Eischer
Date: Fri, 23 Aug 2024 23:24:43 +0200
Subject: [PATCH] lock: introduce short delay between failed locking retries

Failed locking attempts were immediately retried up to three times
without any delay between the retries. If a lock file is not found while
checking for other locks, with the reworked backend retries there is no
delay between those retries. This is a problem if a backend requires a
few seconds to reflect file deletions in the file listings. To work
around this problem, introduce a short exponentially increasing delay
between the retries. The number of retries is now increased to 4. This
results in delays of 5, 10 and 20 seconds between the retries.
---
 .../unreleased/{pull-5011 => issue-5005}      |  3 +++
 internal/restic/lock.go                       | 27 ++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)
 rename changelog/unreleased/{pull-5011 => issue-5005} (76%)

diff --git a/changelog/unreleased/pull-5011 b/changelog/unreleased/issue-5005
similarity index 76%
rename from changelog/unreleased/pull-5011
rename to changelog/unreleased/issue-5005
index 8bd5ef532..90c164b07 100644
--- a/changelog/unreleased/pull-5011
+++ b/changelog/unreleased/issue-5005
@@ -5,6 +5,9 @@ one of the lock files failed to load. The lock operation failed with error
 `unable to create lock in backend: circuit breaker open for file `
 
 The error handling has been fixed to correctly retry locking the repository.
+In addition, restic now waits a few seconds between locking retries to
+increase chances of success.
 
 https://github.com/restic/restic/issues/5005
 https://github.com/restic/restic/pull/5011
+https://github.com/restic/restic/pull/5012
diff --git a/internal/restic/lock.go b/internal/restic/lock.go
index 49c7cedf2..969d0593d 100644
--- a/internal/restic/lock.go
+++ b/internal/restic/lock.go
@@ -103,10 +103,14 @@ func NewExclusiveLock(ctx context.Context, repo Unpacked) (*Lock, error) {
 
 var waitBeforeLockCheck = 200 * time.Millisecond
 
+// delay increases by factor 2 on each retry
+var initialWaitBetweenLockRetries = 5 * time.Second
+
 // TestSetLockTimeout can be used to reduce the lock wait timeout for tests.
 func TestSetLockTimeout(t testing.TB, d time.Duration) {
 	t.Logf("setting lock timeout to %v", d)
 	waitBeforeLockCheck = d
+	initialWaitBetweenLockRetries = d
 }
 
 func newLock(ctx context.Context, repo Unpacked, excl bool) (*Lock, error) {
@@ -170,8 +174,17 @@ func (l *Lock) checkForOtherLocks(ctx context.Context) error {
 	if l.lockID != nil {
 		checkedIDs.Insert(*l.lockID)
 	}
+	delay := initialWaitBetweenLockRetries
 	// retry locking a few times
-	for i := 0; i < 3; i++ {
+	for i := 0; i < 4; i++ {
+		if i != 0 {
+			// sleep between retries to give backend some time to settle
+			if err := cancelableDelay(ctx, delay); err != nil {
+				return err
+			}
+			delay *= 2
+		}
+
 		// Store updates in new IDSet to prevent data races
 		var m sync.Mutex
 		newCheckedIDs := NewIDSet(checkedIDs.List()...)
@@ -213,6 +226,18 @@ func (l *Lock) checkForOtherLocks(ctx context.Context) error {
 	return err
 }
 
+func cancelableDelay(ctx context.Context, delay time.Duration) error {
+	// delay next try a bit
+	timer := time.NewTimer(delay)
+	select {
+	case <-ctx.Done():
+		timer.Stop()
+		return ctx.Err()
+	case <-timer.C:
+	}
+	return nil
+}
+
 // createLock acquires the lock by creating a file in the repository.
 func (l *Lock) createLock(ctx context.Context) (ID, error) {
 	id, err := SaveJSONUnpacked(ctx, l.repo, LockFile, l)