lock: introduce short delay between failed locking retries
Failed locking attempts were immediately retried up to three times, without any delay between the retries. If a lock file is not found while checking for other locks, then with the reworked backend retries there is no delay between those retries either. This is a problem if a backend needs a few seconds before file deletions show up in its file listings. To work around this, introduce a short, exponentially increasing delay between the retries. The number of retries is increased to four, which results in delays of 5, 10 and 20 seconds between the retries.
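As a rough illustration of the schedule described above, here is a minimal, self-contained sketch (not the code from this commit; `retryLockCheck`, `checkLocks`, and the millisecond values in `main` are made up for this example) that retries a check up to four times and doubles a context-cancelable delay between attempts, so an initial delay of 5 seconds yields waits of 5, 10 and 20 seconds:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// retryLockCheck is an illustrative stand-in for restic's lock check: it runs
// checkLocks up to four times and sleeps between attempts, doubling the delay
// each time. The sleep can be interrupted through the context.
func retryLockCheck(ctx context.Context, initial time.Duration, checkLocks func() error) error {
	delay := initial
	var err error
	for i := 0; i < 4; i++ {
		if i != 0 {
			timer := time.NewTimer(delay)
			select {
			case <-ctx.Done():
				timer.Stop()
				return ctx.Err()
			case <-timer.C:
			}
			delay *= 2
		}
		if err = checkLocks(); err == nil {
			return nil
		}
	}
	return err
}

func main() {
	attempt := 0
	// 50ms instead of the commit's 5s initial delay, to keep the demo short.
	err := retryLockCheck(context.Background(), 50*time.Millisecond, func() error {
		attempt++
		fmt.Printf("attempt %d at %s\n", attempt, time.Now().Format("15:04:05.000"))
		return errors.New("lock file not yet visible in listing")
	})
	fmt.Println("final error:", err)
}
```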
parent 8206cd19c8
commit 6eece31dc3

2 changed files with 29 additions and 1 deletion

@@ -5,6 +5,9 @@ one of the lock files failed to load. The lock operation failed with error
 `unable to create lock in backend: circuit breaker open for file <lock/1234567890>`
 
 The error handling has been fixed to correctly retry locking the repository.
+In addition, restic now waits a few seconds between locking retries to
+increase chances of success.
 
 https://github.com/restic/restic/issues/5005
 https://github.com/restic/restic/pull/5011
+https://github.com/restic/restic/pull/5012

@@ -103,10 +103,14 @@ func NewExclusiveLock(ctx context.Context, repo Unpacked) (*Lock, error) {
 
 var waitBeforeLockCheck = 200 * time.Millisecond
 
+// delay increases by factor 2 on each retry
+var initialWaitBetweenLockRetries = 5 * time.Second
+
 // TestSetLockTimeout can be used to reduce the lock wait timeout for tests.
 func TestSetLockTimeout(t testing.TB, d time.Duration) {
 	t.Logf("setting lock timeout to %v", d)
 	waitBeforeLockCheck = d
+	initialWaitBetweenLockRetries = d
 }
 
 func newLock(ctx context.Context, repo Unpacked, excl bool) (*Lock, error) {
@@ -170,8 +174,17 @@ func (l *Lock) checkForOtherLocks(ctx context.Context) error {
 	if l.lockID != nil {
 		checkedIDs.Insert(*l.lockID)
 	}
+	delay := initialWaitBetweenLockRetries
 	// retry locking a few times
-	for i := 0; i < 3; i++ {
+	for i := 0; i < 4; i++ {
+		if i != 0 {
+			// sleep between retries to give backend some time to settle
+			if err := cancelableDelay(ctx, delay); err != nil {
+				return err
+			}
+			delay *= 2
+		}
+
 		// Store updates in new IDSet to prevent data races
 		var m sync.Mutex
 		newCheckedIDs := NewIDSet(checkedIDs.List()...)
@@ -213,6 +226,18 @@ func (l *Lock) checkForOtherLocks(ctx context.Context) error {
 		return err
 	}
 
+func cancelableDelay(ctx context.Context, delay time.Duration) error {
+	// delay next try a bit
+	timer := time.NewTimer(delay)
+	select {
+	case <-ctx.Done():
+		timer.Stop()
+		return ctx.Err()
+	case <-timer.C:
+	}
+	return nil
+}
+
 // createLock acquires the lock by creating a file in the repository.
 func (l *Lock) createLock(ctx context.Context) (ID, error) {
 	id, err := SaveJSONUnpacked(ctx, l.repo, LockFile, l)
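The change to TestSetLockTimeout above also shrinks the new retry delay in tests. A hypothetical in-package test (the test name and body below are illustrative, not part of this commit) could rely on that to keep a retry-heavy lock test fast:

```go
package restic

import (
	"testing"
	"time"
)

// Illustrative only: shows how a test inside package restic could shorten the
// lock delays so that exercising the retry path does not sleep for seconds.
func TestLockRetryDelayIsShortenedForTests(t *testing.T) {
	// Sets both waitBeforeLockCheck and initialWaitBetweenLockRetries to 5ms.
	TestSetLockTimeout(t, 5*time.Millisecond)

	// ... create a test repository and acquire/check locks as usual ...
}
```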