restic/internal/backend/retry/backend_retry.go
Michael Eischer 38654a3bd7 backend/retry: do not log final error if context was canceled
Calls to `List(ctx, ...)` are usually stopped by canceling the context
once no further entries are required by the caller. Thus, don't log the
final error if the used context was canceled.
2024-05-30 18:48:52 +02:00

283 lines
8.5 KiB
Go

package retry
import (
"context"
"errors"
"fmt"
"io"
"sync"
"time"
"github.com/cenkalti/backoff/v4"
"github.com/restic/restic/internal/backend"
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/feature"
)
// Backend retries operations on the backend in case of an error with a
// backoff.
type Backend struct {
backend.Backend
MaxElapsedTime time.Duration
Report func(string, error, time.Duration)
Success func(string, int)
failedLoads sync.Map
}
// statically ensure that RetryBackend implements backend.Backend.
var _ backend.Backend = &Backend{}
// New wraps be with a backend that retries operations after a
// backoff. report is called with a description and the error, if one occurred.
// success is called with the number of retries before a successful operation
// (it is not called if it succeeded on the first try)
func New(be backend.Backend, maxElapsedTime time.Duration, report func(string, error, time.Duration), success func(string, int)) *Backend {
return &Backend{
Backend: be,
MaxElapsedTime: maxElapsedTime,
Report: report,
Success: success,
}
}
// retryNotifyErrorWithSuccess is an extension of backoff.RetryNotify with notification of success after an error.
// success is NOT notified on the first run of operation (only after an error).
func retryNotifyErrorWithSuccess(operation backoff.Operation, b backoff.BackOffContext, notify backoff.Notify, success func(retries int)) error {
var operationWrapper backoff.Operation
if success == nil {
operationWrapper = operation
} else {
retries := 0
operationWrapper = func() error {
err := operation()
if err != nil {
retries++
} else if retries > 0 {
success(retries)
}
return err
}
}
err := backoff.RetryNotify(operationWrapper, b, notify)
if err != nil && notify != nil && b.Context().Err() == nil {
// log final error, unless the context was canceled
notify(err, -1)
}
return err
}
func withRetryAtLeastOnce(delegate *backoff.ExponentialBackOff) *retryAtLeastOnce {
return &retryAtLeastOnce{delegate: delegate}
}
type retryAtLeastOnce struct {
delegate *backoff.ExponentialBackOff
numTries uint64
}
func (b *retryAtLeastOnce) NextBackOff() time.Duration {
delay := b.delegate.NextBackOff()
b.numTries++
if b.numTries == 1 && b.delegate.Stop == delay {
return b.delegate.InitialInterval
}
return delay
}
func (b *retryAtLeastOnce) Reset() {
b.numTries = 0
b.delegate.Reset()
}
var fastRetries = false
func (be *Backend) retry(ctx context.Context, msg string, f func() error) error {
// Don't do anything when called with an already cancelled context. There would be
// no retries in that case either, so be consistent and abort always.
// This enforces a strict contract for backend methods: Using a cancelled context
// will prevent any backup repository modifications. This simplifies ensuring that
// a backup repository is not modified any further after a context was cancelled.
// The 'local' backend for example does not provide this guarantee on its own.
if ctx.Err() != nil {
return ctx.Err()
}
bo := backoff.NewExponentialBackOff()
bo.MaxElapsedTime = be.MaxElapsedTime
if feature.Flag.Enabled(feature.BackendErrorRedesign) {
bo.InitialInterval = 1 * time.Second
bo.Multiplier = 2
}
if fastRetries {
// speed up integration tests
bo.InitialInterval = 1 * time.Millisecond
maxElapsedTime := 200 * time.Millisecond
if bo.MaxElapsedTime > maxElapsedTime {
bo.MaxElapsedTime = maxElapsedTime
}
}
var b backoff.BackOff = withRetryAtLeastOnce(bo)
if !feature.Flag.Enabled(feature.BackendErrorRedesign) {
// deprecated behavior
b = backoff.WithMaxRetries(b, 10)
}
err := retryNotifyErrorWithSuccess(
func() error {
err := f()
// don't retry permanent errors as those very likely cannot be fixed by retrying
// TODO remove IsNotExist(err) special cases when removing the feature flag
if feature.Flag.Enabled(feature.BackendErrorRedesign) && !errors.Is(err, &backoff.PermanentError{}) && be.Backend.IsPermanentError(err) {
return backoff.Permanent(err)
}
return err
},
backoff.WithContext(b, ctx),
func(err error, d time.Duration) {
if be.Report != nil {
be.Report(msg, err, d)
}
},
func(retries int) {
if be.Success != nil {
be.Success(msg, retries)
}
},
)
return err
}
// Save stores the data in the backend under the given handle.
func (be *Backend) Save(ctx context.Context, h backend.Handle, rd backend.RewindReader) error {
return be.retry(ctx, fmt.Sprintf("Save(%v)", h), func() error {
err := rd.Rewind()
if err != nil {
return err
}
err = be.Backend.Save(ctx, h, rd)
if err == nil {
return nil
}
if be.Backend.HasAtomicReplace() {
debug.Log("Save(%v) failed with error: %v", h, err)
// there is no need to remove files from backends which can atomically replace files
// in fact if something goes wrong at the backend side the delete operation might delete the wrong instance of the file
} else {
debug.Log("Save(%v) failed with error, removing file: %v", h, err)
rerr := be.Backend.Remove(ctx, h)
if rerr != nil {
debug.Log("Remove(%v) returned error: %v", h, err)
}
}
// return original error
return err
})
}
// Failed loads expire after an hour
var failedLoadExpiry = time.Hour
// Load returns a reader that yields the contents of the file at h at the
// given offset. If length is larger than zero, only a portion of the file
// is returned. rd must be closed after use. If an error is returned, the
// ReadCloser must be nil.
func (be *Backend) Load(ctx context.Context, h backend.Handle, length int, offset int64, consumer func(rd io.Reader) error) (err error) {
key := h
key.IsMetadata = false
// Implement the circuit breaker pattern for files that exhausted all retries due to a non-permanent error
if v, ok := be.failedLoads.Load(key); ok {
if time.Since(v.(time.Time)) > failedLoadExpiry {
be.failedLoads.Delete(key)
} else {
// fail immediately if the file was already problematic during the last hour
return fmt.Errorf("circuit breaker open for file %v", h)
}
}
err = be.retry(ctx, fmt.Sprintf("Load(%v, %v, %v)", h, length, offset),
func() error {
return be.Backend.Load(ctx, h, length, offset, consumer)
})
if feature.Flag.Enabled(feature.BackendErrorRedesign) && err != nil && !be.IsPermanentError(err) {
// We've exhausted the retries, the file is likely inaccessible. By excluding permanent
// errors, not found or truncated files are not recorded.
be.failedLoads.LoadOrStore(key, time.Now())
}
return err
}
// Stat returns information about the File identified by h.
func (be *Backend) Stat(ctx context.Context, h backend.Handle) (fi backend.FileInfo, err error) {
err = be.retry(ctx, fmt.Sprintf("Stat(%v)", h),
func() error {
var innerError error
fi, innerError = be.Backend.Stat(ctx, h)
if be.Backend.IsNotExist(innerError) {
// do not retry if file is not found, as stat is usually used to check whether a file exists
return backoff.Permanent(innerError)
}
return innerError
})
return fi, err
}
// Remove removes a File with type t and name.
func (be *Backend) Remove(ctx context.Context, h backend.Handle) (err error) {
return be.retry(ctx, fmt.Sprintf("Remove(%v)", h), func() error {
return be.Backend.Remove(ctx, h)
})
}
// List runs fn for each file in the backend which has the type t. When an
// error is returned by the underlying backend, the request is retried. When fn
// returns an error, the operation is aborted and the error is returned to the
// caller.
func (be *Backend) List(ctx context.Context, t backend.FileType, fn func(backend.FileInfo) error) error {
// create a new context that we can cancel when fn returns an error, so
// that listing is aborted
listCtx, cancel := context.WithCancel(ctx)
defer cancel()
listed := make(map[string]struct{}) // remember for which files we already ran fn
var innerErr error // remember when fn returned an error, so we can return that to the caller
err := be.retry(listCtx, fmt.Sprintf("List(%v)", t), func() error {
return be.Backend.List(ctx, t, func(fi backend.FileInfo) error {
if _, ok := listed[fi.Name]; ok {
return nil
}
listed[fi.Name] = struct{}{}
innerErr = fn(fi)
if innerErr != nil {
// if fn returned an error, listing is aborted, so we cancel the context
cancel()
}
return innerErr
})
})
// the error fn returned takes precedence
if innerErr != nil {
return innerErr
}
return err
}
func (be *Backend) Unwrap() backend.Backend {
return be.Backend
}