Merge pull request #3854 from MichaelEischer/sparsefiles

restore: Add support for sparse files
This commit is contained in:
Michael Eischer 2022-09-24 22:04:02 +02:00 committed by GitHub
commit 78d2312ee9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 393 additions and 88 deletions

View file

@ -0,0 +1,17 @@
Enhancement: Restore files with many zeros as sparse files
When using `restore --sparse`, the restorer may now write files containing long
runs of zeros as sparse files (also called files with holes): the zeros are not
actually written to disk.
How much space is saved by writing sparse files depends on the operating
system, file system and the distribution of zeros in the file.
During backup restic still reads the whole file including sparse regions. We
have optimized the processing speed of sparse regions.
https://github.com/restic/restic/issues/79
https://github.com/restic/restic/issues/3903
https://github.com/restic/restic/pull/2601
https://github.com/restic/restic/pull/3854
https://forum.restic.net/t/sparse-file-support/1264

View file

@ -42,6 +42,7 @@ type RestoreOptions struct {
InsensitiveInclude []string
Target string
snapshotFilterOptions
Sparse bool
Verify bool
}
@ -58,6 +59,7 @@ func init() {
flags.StringVarP(&restoreOptions.Target, "target", "t", "", "directory to extract data to")
initSingleSnapshotFilterOptions(flags, &restoreOptions.snapshotFilterOptions)
flags.BoolVar(&restoreOptions.Sparse, "sparse", false, "restore files as sparse")
flags.BoolVar(&restoreOptions.Verify, "verify", false, "verify restored files content")
}
@ -147,7 +149,7 @@ func runRestore(opts RestoreOptions, gopts GlobalOptions, args []string) error {
return err
}
res, err := restorer.NewRestorer(ctx, repo, id)
res, err := restorer.NewRestorer(ctx, repo, id, opts.Sparse)
if err != nil {
Exitf(2, "creating restorer failed: %v\n", err)
}

View file

@ -818,7 +818,14 @@ func (r *Repository) SaveBlob(ctx context.Context, t restic.BlobType, buf []byte
// compute plaintext hash if not already set
if id.IsNull() {
newID = restic.Hash(buf)
// Special case the hash calculation for all zero chunks. This is especially
// useful for sparse files containing large all zero regions. For these we can
// process chunks as fast as we can read them from disk.
if len(buf) == chunker.MinSize && restic.ZeroPrefixLen(buf) == chunker.MinSize {
newID = ZeroChunk()
} else {
newID = restic.Hash(buf)
}
} else {
newID = id
}
@ -972,3 +979,14 @@ func streamPackPart(ctx context.Context, beLoad BackendLoadFn, key *crypto.Key,
})
return errors.Wrap(err, "StreamPack")
}
var (
	// zeroChunkOnce guards the one-time computation of zeroChunkID.
	zeroChunkOnce sync.Once
	// zeroChunkID caches the ID returned by ZeroChunk.
	zeroChunkID restic.ID
)

// ZeroChunk returns the ID of a chunk made up of chunker.MinSize zero bytes.
// The hash is calculated on the first call only; later calls return the
// cached value.
func ZeroChunk() restic.ID {
	zeroChunkOnce.Do(func() {
		zeros := make([]byte, chunker.MinSize)
		zeroChunkID = restic.Hash(zeros)
	})
	return zeroChunkID
}

View file

@ -0,0 +1,21 @@
package restic
import "bytes"
// ZeroPrefixLen reports how many bytes at the start of p are zero. The result
// is the index of the first non-zero byte, or len(p) if p contains only zeros.
func ZeroPrefixLen(p []byte) (n int) {
	// Compare whole 1kB blocks against an all-zero block first; this is
	// faster than inspecting the bytes one at a time.
	var zeroBlock [1024]byte
	rest := p
	for len(rest) >= len(zeroBlock) {
		if !bytes.Equal(rest[:len(zeroBlock)], zeroBlock[:]) {
			break
		}
		rest = rest[len(zeroBlock):]
	}

	// Everything consumed so far was zero; finish byte by byte.
	n = len(p) - len(rest)
	for _, b := range rest {
		if b != 0 {
			break
		}
		n++
	}
	return n
}

View file

@ -0,0 +1,52 @@
package restic_test
import (
"math/rand"
"testing"
"github.com/restic/restic/internal/restic"
"github.com/restic/restic/internal/test"
)
func TestZeroPrefixLen(t *testing.T) {
var buf [2048]byte
// test zero prefixes of various lengths
for i := len(buf) - 1; i >= 0; i-- {
buf[i] = 42
skipped := restic.ZeroPrefixLen(buf[:])
test.Equals(t, i, skipped)
}
// test buffers of various sizes
for i := 0; i < len(buf); i++ {
skipped := restic.ZeroPrefixLen(buf[i:])
test.Equals(t, 0, skipped)
}
}
// BenchmarkZeroPrefixLen measures ZeroPrefixLen on a buffer of slightly more
// than 4 MiB that is all zeros except for a single byte at a random position.
func BenchmarkZeroPrefixLen(b *testing.B) {
	var (
		buf        [4<<20 + 37]byte
		r          = rand.New(rand.NewSource(0x618732))
		sumSkipped int64
	)

	b.ReportAllocs()
	b.SetBytes(int64(len(buf)))
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		// Place a single non-zero byte, measure, then restore the zero so
		// that the next iteration starts from an all-zero buffer again.
		j := r.Intn(len(buf))
		buf[j] = 0xff

		skipped := restic.ZeroPrefixLen(buf[:])
		sumSkipped += int64(skipped)

		buf[j] = 0
	}

	// The closer this is to .5, the better. If it's far off, give the
	// benchmark more time to run with -benchtime.
	//
	// Convert to float64 before multiplying: b.N*len(buf) is an int product
	// and overflows on platforms with a 32-bit int after a few hundred
	// iterations.
	b.Logf("average number of zeros skipped: %.3f",
		float64(sumSkipped)/(float64(b.N)*float64(len(buf))))
}

View file

@ -27,6 +27,7 @@ const (
type fileInfo struct {
lock sync.Mutex
inProgress bool
sparse bool
size int64
location string // file on local filesystem relative to restorer basedir
blobs interface{} // blobs of the file
@ -51,6 +52,8 @@ type fileRestorer struct {
workerCount int
filesWriter *filesWriter
zeroChunk restic.ID
sparse bool
dst string
files []*fileInfo
@ -61,7 +64,8 @@ func newFileRestorer(dst string,
packLoader repository.BackendLoadFn,
key *crypto.Key,
idx func(restic.BlobHandle) []restic.PackedBlob,
connections uint) *fileRestorer {
connections uint,
sparse bool) *fileRestorer {
// as packs are streamed the concurrency is limited by IO
workerCount := int(connections)
@ -71,6 +75,8 @@ func newFileRestorer(dst string,
idx: idx,
packLoader: packLoader,
filesWriter: newFilesWriter(workerCount),
zeroChunk: repository.ZeroChunk(),
sparse: sparse,
workerCount: workerCount,
dst: dst,
Error: restorerAbortOnAllErrors,
@ -133,7 +139,16 @@ func (r *fileRestorer) restoreFiles(ctx context.Context) error {
packOrder = append(packOrder, packID)
}
pack.files[file] = struct{}{}
if blob.ID.Equal(r.zeroChunk) {
file.sparse = r.sparse
}
})
if len(fileBlobs) == 1 {
// no need to preallocate files with a single block, thus we can always consider them to be sparse
// in addition, a short chunk will never match r.zeroChunk which would prevent sparseness for short files
file.sparse = r.sparse
}
if err != nil {
// repository index is messed up, can't do anything
return err
@ -253,7 +268,7 @@ func (r *fileRestorer) downloadPack(ctx context.Context, pack *packInfo) error {
file.inProgress = true
createSize = file.size
}
return r.filesWriter.writeToFile(r.targetPath(file.location), blobData, offset, createSize)
return r.filesWriter.writeToFile(r.targetPath(file.location), blobData, offset, createSize, file.sparse)
}
err := sanitizeError(file, writeToFile())
if err != nil {

View file

@ -147,10 +147,10 @@ func newTestRepo(content []TestFile) *TestRepo {
return repo
}
func restoreAndVerify(t *testing.T, tempdir string, content []TestFile, files map[string]bool) {
func restoreAndVerify(t *testing.T, tempdir string, content []TestFile, files map[string]bool, sparse bool) {
repo := newTestRepo(content)
r := newFileRestorer(tempdir, repo.loader, repo.key, repo.Lookup, 2)
r := newFileRestorer(tempdir, repo.loader, repo.key, repo.Lookup, 2, sparse)
if files == nil {
r.files = repo.files
@ -188,30 +188,32 @@ func TestFileRestorerBasic(t *testing.T) {
tempdir, cleanup := rtest.TempDir(t)
defer cleanup()
restoreAndVerify(t, tempdir, []TestFile{
{
name: "file1",
blobs: []TestBlob{
{"data1-1", "pack1-1"},
{"data1-2", "pack1-2"},
for _, sparse := range []bool{false, true} {
restoreAndVerify(t, tempdir, []TestFile{
{
name: "file1",
blobs: []TestBlob{
{"data1-1", "pack1-1"},
{"data1-2", "pack1-2"},
},
},
},
{
name: "file2",
blobs: []TestBlob{
{"data2-1", "pack2-1"},
{"data2-2", "pack2-2"},
{
name: "file2",
blobs: []TestBlob{
{"data2-1", "pack2-1"},
{"data2-2", "pack2-2"},
},
},
},
{
name: "file3",
blobs: []TestBlob{
// same blob multiple times
{"data3-1", "pack3-1"},
{"data3-1", "pack3-1"},
{
name: "file3",
blobs: []TestBlob{
// same blob multiple times
{"data3-1", "pack3-1"},
{"data3-1", "pack3-1"},
},
},
},
}, nil)
}, nil, sparse)
}
}
func TestFileRestorerPackSkip(t *testing.T) {
@ -221,28 +223,30 @@ func TestFileRestorerPackSkip(t *testing.T) {
files := make(map[string]bool)
files["file2"] = true
restoreAndVerify(t, tempdir, []TestFile{
{
name: "file1",
blobs: []TestBlob{
{"data1-1", "pack1"},
{"data1-2", "pack1"},
{"data1-3", "pack1"},
{"data1-4", "pack1"},
{"data1-5", "pack1"},
{"data1-6", "pack1"},
for _, sparse := range []bool{false, true} {
restoreAndVerify(t, tempdir, []TestFile{
{
name: "file1",
blobs: []TestBlob{
{"data1-1", "pack1"},
{"data1-2", "pack1"},
{"data1-3", "pack1"},
{"data1-4", "pack1"},
{"data1-5", "pack1"},
{"data1-6", "pack1"},
},
},
},
{
name: "file2",
blobs: []TestBlob{
// file is contained in pack1 but need pack parts to be skipped
{"data1-2", "pack1"},
{"data1-4", "pack1"},
{"data1-6", "pack1"},
{
name: "file2",
blobs: []TestBlob{
// file is contained in pack1 but need pack parts to be skipped
{"data1-2", "pack1"},
{"data1-4", "pack1"},
{"data1-6", "pack1"},
},
},
},
}, files)
}, files, sparse)
}
}
func TestErrorRestoreFiles(t *testing.T) {
@ -264,7 +268,7 @@ func TestErrorRestoreFiles(t *testing.T) {
return loadError
}
r := newFileRestorer(tempdir, repo.loader, repo.key, repo.Lookup, 2)
r := newFileRestorer(tempdir, repo.loader, repo.key, repo.Lookup, 2, false)
r.files = repo.files
err := r.restoreFiles(context.TODO())
@ -304,7 +308,7 @@ func testPartialDownloadError(t *testing.T, part int) {
return loader(ctx, h, length, offset, fn)
}
r := newFileRestorer(tempdir, repo.loader, repo.key, repo.Lookup, 2)
r := newFileRestorer(tempdir, repo.loader, repo.key, repo.Lookup, 2, false)
r.files = repo.files
r.Error = func(s string, e error) error {
// ignore errors as in the `restore` command

View file

@ -19,30 +19,34 @@ type filesWriter struct {
type filesWriterBucket struct {
lock sync.Mutex
files map[string]*os.File
users map[string]int
files map[string]*partialFile
}
// partialFile is an open restore target file as tracked in a
// filesWriterBucket. It embeds *os.File, so the file's methods are
// available on it directly.
type partialFile struct {
*os.File
// users counts the writers currently holding this file; the file is
// closed and removed from its bucket when the last one releases it.
users int // Reference count.
// sparse selects the sparse write path in WriteAt.
sparse bool
}
func newFilesWriter(count int) *filesWriter {
buckets := make([]filesWriterBucket, count)
for b := 0; b < count; b++ {
buckets[b].files = make(map[string]*os.File)
buckets[b].users = make(map[string]int)
buckets[b].files = make(map[string]*partialFile)
}
return &filesWriter{
buckets: buckets,
}
}
func (w *filesWriter) writeToFile(path string, blob []byte, offset int64, createSize int64) error {
func (w *filesWriter) writeToFile(path string, blob []byte, offset int64, createSize int64, sparse bool) error {
bucket := &w.buckets[uint(xxhash.Sum64String(path))%uint(len(w.buckets))]
acquireWriter := func() (*os.File, error) {
acquireWriter := func() (*partialFile, error) {
bucket.lock.Lock()
defer bucket.lock.Unlock()
if wr, ok := bucket.files[path]; ok {
bucket.users[path]++
bucket.files[path].users++
return wr, nil
}
@ -53,39 +57,45 @@ func (w *filesWriter) writeToFile(path string, blob []byte, offset int64, create
flags = os.O_WRONLY
}
wr, err := os.OpenFile(path, flags, 0600)
f, err := os.OpenFile(path, flags, 0600)
if err != nil {
return nil, err
}
wr := &partialFile{File: f, users: 1, sparse: sparse}
bucket.files[path] = wr
bucket.users[path] = 1
if createSize >= 0 {
err := preallocateFile(wr, createSize)
if err != nil {
// Just log the preallocate error but don't let it cause the restore process to fail.
// Preallocate might return an error if the filesystem (implementation) does not
// support preallocation or our parameters combination to the preallocate call
// This should yield a syscall.ENOTSUP error, but some other errors might also
// show up.
debug.Log("Failed to preallocate %v with size %v: %v", path, createSize, err)
if sparse {
err = truncateSparse(f, createSize)
if err != nil {
return nil, err
}
} else {
err := preallocateFile(wr.File, createSize)
if err != nil {
// Just log the preallocate error but don't let it cause the restore process to fail.
// Preallocate might return an error if the filesystem (implementation) does not
// support preallocation or our parameters combination to the preallocate call
// This should yield a syscall.ENOTSUP error, but some other errors might also
// show up.
debug.Log("Failed to preallocate %v with size %v: %v", path, createSize, err)
}
}
}
return wr, nil
}
releaseWriter := func(wr *os.File) error {
releaseWriter := func(wr *partialFile) error {
bucket.lock.Lock()
defer bucket.lock.Unlock()
if bucket.users[path] == 1 {
if bucket.files[path].users == 1 {
delete(bucket.files, path)
delete(bucket.users, path)
return wr.Close()
}
bucket.users[path]--
bucket.files[path].users--
return nil
}

View file

@ -16,21 +16,17 @@ func TestFilesWriterBasic(t *testing.T) {
f1 := dir + "/f1"
f2 := dir + "/f2"
rtest.OK(t, w.writeToFile(f1, []byte{1}, 0, 2))
rtest.OK(t, w.writeToFile(f1, []byte{1}, 0, 2, false))
rtest.Equals(t, 0, len(w.buckets[0].files))
rtest.Equals(t, 0, len(w.buckets[0].users))
rtest.OK(t, w.writeToFile(f2, []byte{2}, 0, 2))
rtest.OK(t, w.writeToFile(f2, []byte{2}, 0, 2, false))
rtest.Equals(t, 0, len(w.buckets[0].files))
rtest.Equals(t, 0, len(w.buckets[0].users))
rtest.OK(t, w.writeToFile(f1, []byte{1}, 1, -1))
rtest.OK(t, w.writeToFile(f1, []byte{1}, 1, -1, false))
rtest.Equals(t, 0, len(w.buckets[0].files))
rtest.Equals(t, 0, len(w.buckets[0].users))
rtest.OK(t, w.writeToFile(f2, []byte{2}, 1, -1))
rtest.OK(t, w.writeToFile(f2, []byte{2}, 1, -1, false))
rtest.Equals(t, 0, len(w.buckets[0].files))
rtest.Equals(t, 0, len(w.buckets[0].users))
buf, err := ioutil.ReadFile(f1)
rtest.OK(t, err)

View file

@ -16,8 +16,9 @@ import (
// Restorer is used to restore a snapshot to a directory.
type Restorer struct {
repo restic.Repository
sn *restic.Snapshot
repo restic.Repository
sn *restic.Snapshot
sparse bool
Error func(location string, err error) error
SelectFilter func(item string, dstpath string, node *restic.Node) (selectedForRestore bool, childMayBeSelected bool)
@ -26,9 +27,10 @@ type Restorer struct {
var restorerAbortOnAllErrors = func(location string, err error) error { return err }
// NewRestorer creates a restorer preloaded with the content from the snapshot id.
func NewRestorer(ctx context.Context, repo restic.Repository, id restic.ID) (*Restorer, error) {
func NewRestorer(ctx context.Context, repo restic.Repository, id restic.ID, sparse bool) (*Restorer, error) {
r := &Restorer{
repo: repo,
sparse: sparse,
Error: restorerAbortOnAllErrors,
SelectFilter: func(string, string, *restic.Node) (bool, bool) { return true, true },
}
@ -219,7 +221,7 @@ func (res *Restorer) RestoreTo(ctx context.Context, dst string) error {
}
idx := NewHardlinkIndex()
filerestorer := newFileRestorer(dst, res.repo.Backend().Load, res.repo.Key(), res.repo.Index().Lookup, res.repo.Connections())
filerestorer := newFileRestorer(dst, res.repo.Backend().Load, res.repo.Key(), res.repo.Index().Lookup, res.repo.Connections(), res.sparse)
filerestorer.Error = res.Error
debug.Log("first pass for %q", dst)

View file

@ -4,6 +4,7 @@ import (
"bytes"
"context"
"io/ioutil"
"math"
"os"
"path/filepath"
"runtime"
@ -11,6 +12,7 @@ import (
"testing"
"time"
"github.com/restic/restic/internal/archiver"
"github.com/restic/restic/internal/fs"
"github.com/restic/restic/internal/repository"
"github.com/restic/restic/internal/restic"
@ -324,7 +326,7 @@ func TestRestorer(t *testing.T) {
_, id := saveSnapshot(t, repo, test.Snapshot)
t.Logf("snapshot saved as %v", id.Str())
res, err := NewRestorer(context.TODO(), repo, id)
res, err := NewRestorer(context.TODO(), repo, id, false)
if err != nil {
t.Fatal(err)
}
@ -447,7 +449,7 @@ func TestRestorerRelative(t *testing.T) {
_, id := saveSnapshot(t, repo, test.Snapshot)
t.Logf("snapshot saved as %v", id.Str())
res, err := NewRestorer(context.TODO(), repo, id)
res, err := NewRestorer(context.TODO(), repo, id, false)
if err != nil {
t.Fatal(err)
}
@ -682,7 +684,7 @@ func TestRestorerTraverseTree(t *testing.T) {
defer cleanup()
sn, id := saveSnapshot(t, repo, test.Snapshot)
res, err := NewRestorer(context.TODO(), repo, id)
res, err := NewRestorer(context.TODO(), repo, id, false)
if err != nil {
t.Fatal(err)
}
@ -764,7 +766,7 @@ func TestRestorerConsistentTimestampsAndPermissions(t *testing.T) {
},
})
res, err := NewRestorer(context.TODO(), repo, id)
res, err := NewRestorer(context.TODO(), repo, id, false)
rtest.OK(t, err)
res.SelectFilter = func(item string, dstpath string, node *restic.Node) (selectedForRestore bool, childMayBeSelected bool) {
@ -824,7 +826,7 @@ func TestVerifyCancel(t *testing.T) {
_, id := saveSnapshot(t, repo, snapshot)
res, err := NewRestorer(context.TODO(), repo, id)
res, err := NewRestorer(context.TODO(), repo, id, false)
rtest.OK(t, err)
tempdir, cleanup := rtest.TempDir(t)
@ -849,3 +851,58 @@ func TestVerifyCancel(t *testing.T) {
rtest.Equals(t, 1, len(errs))
rtest.Assert(t, strings.Contains(errs[0].Error(), "Invalid file size for"), "wrong error %q", errs[0].Error())
}
// TestRestorerSparseFiles backs up a file that consists only of zero bytes,
// restores it with the sparse option enabled and verifies that the restored
// content is identical. The achieved sparsity is logged but not asserted.
func TestRestorerSparseFiles(t *testing.T) {
	repo, cleanup := repository.TestRepository(t)
	defer cleanup()

	const name = "/zeros"
	var zeros [1<<20 + 13]byte

	// Back up the zeros from an in-memory reader.
	target := &fs.Reader{
		Mode:       0600,
		Name:       name,
		ReadCloser: ioutil.NopCloser(bytes.NewReader(zeros[:])),
	}
	rtest.OK(t, archiver.NewScanner(target).Scan(context.TODO(), []string{name}))

	arch := archiver.New(repo, target, archiver.Options{})
	_, id, err := arch.Snapshot(context.Background(), []string{name},
		archiver.SnapshotOptions{})
	rtest.OK(t, err)

	// Restore the snapshot with sparse set to true.
	res, err := NewRestorer(context.TODO(), repo, id, true)
	rtest.OK(t, err)

	tempdir, removeTempdir := rtest.TempDir(t)
	defer removeTempdir()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	rtest.OK(t, res.RestoreTo(ctx, tempdir))

	// The restored file must have the original size and content.
	filename := filepath.Join(tempdir, "zeros")
	content, err := ioutil.ReadFile(filename)
	rtest.OK(t, err)

	rtest.Equals(t, len(zeros[:]), len(content))
	rtest.Equals(t, zeros[:], content)

	// A negative block count means that it could not be determined.
	blocks := getBlockCount(t, filename)
	if blocks >= 0 {
		// st.Blocks is the size in 512-byte blocks.
		denseBlocks := math.Ceil(float64(len(zeros)) / 512)
		sparsity := 1 - float64(blocks)/denseBlocks

		// This should report 100% sparse. We don't assert that,
		// as the behavior of sparse writes depends on the underlying
		// file system as well as the OS.
		t.Logf("wrote %d zeros as %d blocks, %.1f%% sparse",
			len(zeros), blocks, 100*sparsity)
	}
}

View file

@ -30,7 +30,7 @@ func TestRestorerRestoreEmptyHardlinkedFileds(t *testing.T) {
},
})
res, err := NewRestorer(context.TODO(), repo, id)
res, err := NewRestorer(context.TODO(), repo, id, false)
rtest.OK(t, err)
res.SelectFilter = func(item string, dstpath string, node *restic.Node) (selectedForRestore bool, childMayBeSelected bool) {
@ -60,3 +60,13 @@ func TestRestorerRestoreEmptyHardlinkedFileds(t *testing.T) {
rtest.Equals(t, s1.Ino, s2.Ino)
}
}
// getBlockCount returns the block count that stat reports for filename (the
// caller treats it as 512-byte blocks). It returns -1 if the file info does
// not carry a *syscall.Stat_t.
func getBlockCount(t *testing.T, filename string) int64 {
	fi, err := os.Stat(filename)
	rtest.OK(t, err)

	// Use the two-value form of the type assertion: the single-value form
	// panics if Sys() returns nil or another type, so a nil check after it
	// could never report that case to the caller.
	st, ok := fi.Sys().(*syscall.Stat_t)
	if !ok || st == nil {
		return -1
	}
	return st.Blocks
}

View file

@ -0,0 +1,35 @@
//go:build windows
// +build windows
package restorer
import (
"math"
"syscall"
"testing"
"unsafe"
rtest "github.com/restic/restic/internal/test"
"golang.org/x/sys/windows"
)
// getBlockCount calls GetCompressedFileSizeW from kernel32.dll for filename
// and returns the result converted to 512-byte blocks, rounded up. It returns
// -1 if the call yields 4294967295, which this code treats as the invalid
// file size marker.
func getBlockCount(t *testing.T, filename string) int64 {
	kernel32 := windows.NewLazySystemDLL("kernel32.dll")
	rtest.OK(t, kernel32.Load())

	getCompressedFileSize := kernel32.NewProc("GetCompressedFileSizeW")
	rtest.OK(t, getCompressedFileSize.Find())

	name, err := syscall.UTF16PtrFromString(filename)
	rtest.OK(t, err)

	const invalidFileSize = uintptr(4294967295)

	// The second argument is passed as 0, so only the first return value of
	// the call is used as the size.
	size, _, _ := getCompressedFileSize.Call(uintptr(unsafe.Pointer(name)), 0)
	if size == invalidFileSize {
		return -1
	}

	return int64(math.Ceil(float64(size) / 512))
}

View file

@ -0,0 +1,37 @@
//go:build !windows
// +build !windows
package restorer
import (
"github.com/restic/restic/internal/restic"
)
// WriteAt writes p to f.File at offset. If f.sparse is set, the longest
// all-zero prefix of p is not written; only the remainder of p is passed on
// to f.File.WriteAt at the correspondingly advanced offset.
//
// The returned n includes the skipped zero bytes, so a fully successful call
// returns len(p) in both modes.
func (f *partialFile) WriteAt(p []byte, offset int64) (n int, err error) {
	if !f.sparse {
		return f.File.WriteAt(p, offset)
	}

	// Skip the longest all-zero prefix of p. The skipped range is not
	// touched at all, which leaves room for a hole in the file.
	skipped := restic.ZeroPrefixLen(p)
	if skipped == len(p) {
		// All zeros, file already big enough. A previous WriteAt or
		// Truncate will have produced the zeros in f.File.
		return len(p), nil
	}

	n, err = f.File.WriteAt(p[skipped:], offset+int64(skipped))
	return skipped + n, err
}

View file

@ -0,0 +1,10 @@
//go:build !windows
// +build !windows
package restorer
import "os"
// truncateSparse changes the size of f to size by calling f.Truncate and
// returns that call's error. This variant is built on all platforms except
// Windows and takes no additional steps.
func truncateSparse(f *os.File, size int64) error {
return f.Truncate(size)
}

View file

@ -0,0 +1,19 @@
package restorer
import (
"os"
"github.com/restic/restic/internal/debug"
"golang.org/x/sys/windows"
)
// truncateSparse asks Windows to mark f as a sparse file and then changes
// its size to size. Failing to set the sparse attribute is not fatal: it is
// only written to the debug log. The error of f.Truncate is returned.
func truncateSparse(f *os.File, size int64) error {
	// try setting the sparse file attribute, but ignore the error if it fails
	var bytesReturned uint32
	handle := windows.Handle(f.Fd())
	if err := windows.DeviceIoControl(handle, windows.FSCTL_SET_SPARSE, nil, 0, nil, 0, &bytesReturned, nil); err != nil {
		debug.Log("failed to set sparse attribute for %v: %v", f.Name(), err)
	}

	return f.Truncate(size)
}