forked from TrueCloudLab/restic
Merge pull request #4626 from MichaelEischer/reliable-large-restores
Improve reliability of large restores
This commit is contained in:
commit
c31e9418ba
3 changed files with 150 additions and 77 deletions
11
changelog/unreleased/pull-4626
Normal file
11
changelog/unreleased/pull-4626
Normal file
|
@ -0,0 +1,11 @@
|
|||
Bugfix: Improve reliability of restoring large files
|
||||
|
||||
In some cases restic failed to restore large files that frequently contain the
|
||||
same file chunk. In combination with certain backends, this could result in
|
||||
network connection timeouts that caused incomplete restores.
|
||||
|
||||
Restic now includes special handling for such file chunks to ensure reliable
|
||||
restores.
|
||||
|
||||
https://github.com/restic/restic/pull/4626
|
||||
https://forum.restic.net/t/errors-restoring-with-restic-on-windows-server-s3/6943
|
|
@ -197,19 +197,20 @@ func (r *fileRestorer) restoreFiles(ctx context.Context) error {
|
|||
return wg.Wait()
|
||||
}
|
||||
|
||||
func (r *fileRestorer) downloadPack(ctx context.Context, pack *packInfo) error {
|
||||
type blobToFileOffsetsMapping map[restic.ID]struct {
|
||||
files map[*fileInfo][]int64 // file -> offsets (plural!) of the blob in the file
|
||||
blob restic.Blob
|
||||
}
|
||||
|
||||
func (r *fileRestorer) downloadPack(ctx context.Context, pack *packInfo) error {
|
||||
// calculate blob->[]files->[]offsets mappings
|
||||
blobs := make(map[restic.ID]struct {
|
||||
files map[*fileInfo][]int64 // file -> offsets (plural!) of the blob in the file
|
||||
})
|
||||
var blobList []restic.Blob
|
||||
blobs := make(blobToFileOffsetsMapping)
|
||||
for file := range pack.files {
|
||||
addBlob := func(blob restic.Blob, fileOffset int64) {
|
||||
blobInfo, ok := blobs[blob.ID]
|
||||
if !ok {
|
||||
blobInfo.files = make(map[*fileInfo][]int64)
|
||||
blobList = append(blobList, blob)
|
||||
blobInfo.blob = blob
|
||||
blobs[blob.ID] = blobInfo
|
||||
}
|
||||
blobInfo.files[file] = append(blobInfo.files[file], fileOffset)
|
||||
|
@ -239,80 +240,120 @@ func (r *fileRestorer) downloadPack(ctx context.Context, pack *packInfo) error {
|
|||
}
|
||||
}
|
||||
|
||||
sanitizeError := func(file *fileInfo, err error) error {
|
||||
if err != nil {
|
||||
err = r.Error(file.location, err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// track already processed blobs for precise error reporting
|
||||
processedBlobs := restic.NewBlobSet()
|
||||
err := repository.StreamPack(ctx, r.packLoader, r.key, pack.id, blobList, func(h restic.BlobHandle, blobData []byte, err error) error {
|
||||
processedBlobs.Insert(h)
|
||||
blob := blobs[h.ID]
|
||||
if err != nil {
|
||||
for file := range blob.files {
|
||||
if errFile := sanitizeError(file, err); errFile != nil {
|
||||
return errFile
|
||||
for _, entry := range blobs {
|
||||
occurrences := 0
|
||||
for _, offsets := range entry.files {
|
||||
occurrences += len(offsets)
|
||||
}
|
||||
// With a maximum blob size of 8MB, the normal blob streaming has to write
|
||||
// at most 800MB for a single blob. This should be short enough to avoid
|
||||
// network connection timeouts. Based on a quick test, a limit of 100 only
|
||||
// selects a very small number of blobs (the number of references per blob
|
||||
// - aka. `count` - seem to follow a expontential distribution)
|
||||
if occurrences > 100 {
|
||||
// process frequently referenced blobs first as these can take a long time to write
|
||||
// which can cause backend connections to time out
|
||||
delete(blobs, entry.blob.ID)
|
||||
partialBlobs := blobToFileOffsetsMapping{entry.blob.ID: entry}
|
||||
err := r.downloadBlobs(ctx, pack.id, partialBlobs, processedBlobs)
|
||||
if err := r.reportError(blobs, processedBlobs, err); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(blobs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
err := r.downloadBlobs(ctx, pack.id, blobs, processedBlobs)
|
||||
return r.reportError(blobs, processedBlobs, err)
|
||||
}
|
||||
|
||||
func (r *fileRestorer) sanitizeError(file *fileInfo, err error) error {
|
||||
if err != nil {
|
||||
err = r.Error(file.location, err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (r *fileRestorer) reportError(blobs blobToFileOffsetsMapping, processedBlobs restic.BlobSet, err error) error {
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// only report error for not yet processed blobs
|
||||
affectedFiles := make(map[*fileInfo]struct{})
|
||||
for _, entry := range blobs {
|
||||
if processedBlobs.Has(entry.blob.BlobHandle) {
|
||||
continue
|
||||
}
|
||||
for file := range entry.files {
|
||||
affectedFiles[file] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
for file := range affectedFiles {
|
||||
if errFile := r.sanitizeError(file, err); errFile != nil {
|
||||
return errFile
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *fileRestorer) downloadBlobs(ctx context.Context, packID restic.ID,
|
||||
blobs blobToFileOffsetsMapping, processedBlobs restic.BlobSet) error {
|
||||
|
||||
blobList := make([]restic.Blob, 0, len(blobs))
|
||||
for _, entry := range blobs {
|
||||
blobList = append(blobList, entry.blob)
|
||||
}
|
||||
return repository.StreamPack(ctx, r.packLoader, r.key, packID, blobList,
|
||||
func(h restic.BlobHandle, blobData []byte, err error) error {
|
||||
processedBlobs.Insert(h)
|
||||
blob := blobs[h.ID]
|
||||
if err != nil {
|
||||
for file := range blob.files {
|
||||
if errFile := r.sanitizeError(file, err); errFile != nil {
|
||||
return errFile
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
for file, offsets := range blob.files {
|
||||
for _, offset := range offsets {
|
||||
writeToFile := func() error {
|
||||
// this looks overly complicated and needs explanation
|
||||
// two competing requirements:
|
||||
// - must create the file once and only once
|
||||
// - should allow concurrent writes to the file
|
||||
// so write the first blob while holding file lock
|
||||
// write other blobs after releasing the lock
|
||||
createSize := int64(-1)
|
||||
file.lock.Lock()
|
||||
if file.inProgress {
|
||||
file.lock.Unlock()
|
||||
} else {
|
||||
defer file.lock.Unlock()
|
||||
file.inProgress = true
|
||||
createSize = file.size
|
||||
}
|
||||
writeErr := r.filesWriter.writeToFile(r.targetPath(file.location), blobData, offset, createSize, file.sparse)
|
||||
|
||||
if r.progress != nil {
|
||||
r.progress.AddProgress(file.location, uint64(len(blobData)), uint64(file.size))
|
||||
}
|
||||
|
||||
return writeErr
|
||||
}
|
||||
err := r.sanitizeError(file, writeToFile())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
for file, offsets := range blob.files {
|
||||
for _, offset := range offsets {
|
||||
writeToFile := func() error {
|
||||
// this looks overly complicated and needs explanation
|
||||
// two competing requirements:
|
||||
// - must create the file once and only once
|
||||
// - should allow concurrent writes to the file
|
||||
// so write the first blob while holding file lock
|
||||
// write other blobs after releasing the lock
|
||||
createSize := int64(-1)
|
||||
file.lock.Lock()
|
||||
if file.inProgress {
|
||||
file.lock.Unlock()
|
||||
} else {
|
||||
defer file.lock.Unlock()
|
||||
file.inProgress = true
|
||||
createSize = file.size
|
||||
}
|
||||
writeErr := r.filesWriter.writeToFile(r.targetPath(file.location), blobData, offset, createSize, file.sparse)
|
||||
|
||||
if r.progress != nil {
|
||||
r.progress.AddProgress(file.location, uint64(len(blobData)), uint64(file.size))
|
||||
}
|
||||
|
||||
return writeErr
|
||||
}
|
||||
err := sanitizeError(file, writeToFile())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
// only report error for not yet processed blobs
|
||||
affectedFiles := make(map[*fileInfo]struct{})
|
||||
for _, blob := range blobList {
|
||||
if processedBlobs.Has(blob.BlobHandle) {
|
||||
continue
|
||||
}
|
||||
blob := blobs[blob.ID]
|
||||
for file := range blob.files {
|
||||
affectedFiles[file] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
for file := range affectedFiles {
|
||||
if errFile := sanitizeError(file, err); errFile != nil {
|
||||
return errFile
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
|
|
@ -248,6 +248,27 @@ func TestFileRestorerPackSkip(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestFileRestorerFrequentBlob(t *testing.T) {
|
||||
tempdir := rtest.TempDir(t)
|
||||
|
||||
for _, sparse := range []bool{false, true} {
|
||||
blobs := []TestBlob{
|
||||
{"data1-1", "pack1-1"},
|
||||
}
|
||||
for i := 0; i < 10000; i++ {
|
||||
blobs = append(blobs, TestBlob{"a", "pack1-1"})
|
||||
}
|
||||
blobs = append(blobs, TestBlob{"end", "pack1-1"})
|
||||
|
||||
restoreAndVerify(t, tempdir, []TestFile{
|
||||
{
|
||||
name: "file1",
|
||||
blobs: blobs,
|
||||
},
|
||||
}, nil, sparse)
|
||||
}
|
||||
}
|
||||
|
||||
func TestErrorRestoreFiles(t *testing.T) {
|
||||
tempdir := rtest.TempDir(t)
|
||||
content := []TestFile{
|
||||
|
|
Loading…
Reference in a new issue