lock: introduce short delay between failed locking retries

Failed locking attempts were immediately retried up to three times
without any delay between the retries. If a lock file is not found while
checking for other locks, with the reworked backend retries there is no
delay between those retries. This is a problem if a backend requires a
few seconds to reflect file deletions in the file listings. To work
around this problem, introduce a short exponentially increasing delay
between the retries. The number of locking attempts is now increased to 4.
This results in delays of 5, 10 and 20 seconds between the attempts.
This commit is contained in:
Michael Eischer 2024-08-23 23:24:43 +02:00
parent 8206cd19c8
commit 6eece31dc3
2 changed files with 29 additions and 1 deletions

View file

@ -5,6 +5,9 @@ one of the lock files failed to load. The lock operation failed with error
`unable to create lock in backend: circuit breaker open for file <lock/1234567890>` `unable to create lock in backend: circuit breaker open for file <lock/1234567890>`
The error handling has been fixed to correctly retry locking the repository. The error handling has been fixed to correctly retry locking the repository.
In addition, restic now waits a few seconds between locking retries to
increase the chances of success.
https://github.com/restic/restic/issues/5005 https://github.com/restic/restic/issues/5005
https://github.com/restic/restic/pull/5011 https://github.com/restic/restic/pull/5011
https://github.com/restic/restic/pull/5012

View file

@ -103,10 +103,14 @@ func NewExclusiveLock(ctx context.Context, repo Unpacked) (*Lock, error) {
var waitBeforeLockCheck = 200 * time.Millisecond var waitBeforeLockCheck = 200 * time.Millisecond
// initialWaitBetweenLockRetries is the delay applied before the first locking
// retry; the delay increases by factor 2 on each subsequent retry.
var initialWaitBetweenLockRetries = 5 * time.Second
// TestSetLockTimeout can be used to reduce the lock wait timeout for tests. // TestSetLockTimeout can be used to reduce the lock wait timeout for tests.
func TestSetLockTimeout(t testing.TB, d time.Duration) { func TestSetLockTimeout(t testing.TB, d time.Duration) {
t.Logf("setting lock timeout to %v", d) t.Logf("setting lock timeout to %v", d)
waitBeforeLockCheck = d waitBeforeLockCheck = d
initialWaitBetweenLockRetries = d
} }
func newLock(ctx context.Context, repo Unpacked, excl bool) (*Lock, error) { func newLock(ctx context.Context, repo Unpacked, excl bool) (*Lock, error) {
@ -170,8 +174,17 @@ func (l *Lock) checkForOtherLocks(ctx context.Context) error {
if l.lockID != nil { if l.lockID != nil {
checkedIDs.Insert(*l.lockID) checkedIDs.Insert(*l.lockID)
} }
delay := initialWaitBetweenLockRetries
// retry locking a few times // retry locking a few times
for i := 0; i < 3; i++ { for i := 0; i < 4; i++ {
if i != 0 {
// sleep between retries to give backend some time to settle
if err := cancelableDelay(ctx, delay); err != nil {
return err
}
delay *= 2
}
// Store updates in new IDSet to prevent data races // Store updates in new IDSet to prevent data races
var m sync.Mutex var m sync.Mutex
newCheckedIDs := NewIDSet(checkedIDs.List()...) newCheckedIDs := NewIDSet(checkedIDs.List()...)
@ -213,6 +226,18 @@ func (l *Lock) checkForOtherLocks(ctx context.Context) error {
return err return err
} }
// cancelableDelay blocks until either the given delay has elapsed or ctx is
// done, whichever happens first. It returns ctx.Err() when the context ended
// the wait and nil when the full delay passed.
func cancelableDelay(ctx context.Context, delay time.Duration) error {
	wait := time.NewTimer(delay)
	// Release the timer on every exit path; stopping a timer that has
	// already fired is a harmless no-op.
	defer wait.Stop()

	select {
	case <-wait.C:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}
// createLock acquires the lock by creating a file in the repository. // createLock acquires the lock by creating a file in the repository.
func (l *Lock) createLock(ctx context.Context) (ID, error) { func (l *Lock) createLock(ctx context.Context) (ID, error) {
id, err := SaveJSONUnpacked(ctx, l.repo, LockFile, l) id, err := SaveJSONUnpacked(ctx, l.repo, LockFile, l)