From 2296fdf668468797b3ca1aa3be8b5dfeba1a9780 Mon Sep 17 00:00:00 2001
From: Michael Eischer <michael.eischer@fau.de>
Date: Fri, 23 Aug 2024 23:24:43 +0200
Subject: [PATCH] lock: introduce short delay between failed locking retries

Failed locking attempts were immediately retried up to three times
without any delay between the retries. If a lock file is not found while
checking for other locks, with the reworked backend retries there is no
delay between those retries. This is a problem if a backend requires a
few seconds to reflect file deletions in the file listings. To work
around this problem, introduce a short exponentially increasing delay
between the retries. The number of locking attempts is now increased
from 3 to 4, which results in delays of 5, 10 and 20 seconds between
them.
---
 .../unreleased/{pull-5011 => issue-5005}      |  3 +++
 internal/restic/lock.go                       | 27 ++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)
 rename changelog/unreleased/{pull-5011 => issue-5005} (76%)

diff --git a/changelog/unreleased/pull-5011 b/changelog/unreleased/issue-5005
similarity index 76%
rename from changelog/unreleased/pull-5011
rename to changelog/unreleased/issue-5005
index 8bd5ef532..90c164b07 100644
--- a/changelog/unreleased/pull-5011
+++ b/changelog/unreleased/issue-5005
@@ -5,6 +5,9 @@ one of the lock files failed to load. The lock operation failed with error
 `unable to create lock in backend: circuit breaker open for file <lock/1234567890>`
 
 The error handling has been fixed to correctly retry locking the repository.
+In addition, restic now waits a few seconds between locking retries to
+increase the chances of success.
 
 https://github.com/restic/restic/issues/5005
 https://github.com/restic/restic/pull/5011
+https://github.com/restic/restic/pull/5012
diff --git a/internal/restic/lock.go b/internal/restic/lock.go
index 49c7cedf2..969d0593d 100644
--- a/internal/restic/lock.go
+++ b/internal/restic/lock.go
@@ -103,10 +103,14 @@ func NewExclusiveLock(ctx context.Context, repo Unpacked) (*Lock, error) {
 
 var waitBeforeLockCheck = 200 * time.Millisecond
 
+// delay increases by factor 2 on each retry
+var initialWaitBetweenLockRetries = 5 * time.Second
+
 // TestSetLockTimeout can be used to reduce the lock wait timeout for tests.
 func TestSetLockTimeout(t testing.TB, d time.Duration) {
 	t.Logf("setting lock timeout to %v", d)
 	waitBeforeLockCheck = d
+	initialWaitBetweenLockRetries = d
 }
 
 func newLock(ctx context.Context, repo Unpacked, excl bool) (*Lock, error) {
@@ -170,8 +174,17 @@ func (l *Lock) checkForOtherLocks(ctx context.Context) error {
 	if l.lockID != nil {
 		checkedIDs.Insert(*l.lockID)
 	}
+	delay := initialWaitBetweenLockRetries
 	// retry locking a few times
-	for i := 0; i < 3; i++ {
+	for i := 0; i < 4; i++ {
+		if i != 0 {
+			// sleep between retries to give backend some time to settle
+			if err := cancelableDelay(ctx, delay); err != nil {
+				return err
+			}
+			delay *= 2
+		}
+
 		// Store updates in new IDSet to prevent data races
 		var m sync.Mutex
 		newCheckedIDs := NewIDSet(checkedIDs.List()...)
@@ -213,6 +226,18 @@ func (l *Lock) checkForOtherLocks(ctx context.Context) error {
 	return err
 }
 
+func cancelableDelay(ctx context.Context, delay time.Duration) error {
+	// delay next try a bit
+	timer := time.NewTimer(delay)
+	select {
+	case <-ctx.Done():
+		timer.Stop()
+		return ctx.Err()
+	case <-timer.C:
+	}
+	return nil
+}
+
 // createLock acquires the lock by creating a file in the repository.
 func (l *Lock) createLock(ctx context.Context) (ID, error) {
 	id, err := SaveJSONUnpacked(ctx, l.repo, LockFile, l)