diff --git a/changelog/0.8.2/pull-1549 b/changelog/0.8.2/pull-1549 new file mode 100644 index 000000000..b337facd3 --- /dev/null +++ b/changelog/0.8.2/pull-1549 @@ -0,0 +1,9 @@ +Enhancement: Speed up querying across indices and scanning existing files + +This change increases the whenever a blob (part of a file) is searched for in a +restic repository. This will reduce cpu usage some when backing up files already +backed up by restic. Cpu usage is further decreased when scanning files. + +https://github.com/restic/restic/issues/1234 +https://github.com/restic/restic/pull/55555 +https://forum.restic/.net/foo/bar/baz diff --git a/cmd/restic/cmd_cat.go b/cmd/restic/cmd_cat.go index 7e66af734..98c97a5a5 100644 --- a/cmd/restic/cmd_cat.go +++ b/cmd/restic/cmd_cat.go @@ -165,8 +165,8 @@ func runCat(gopts GlobalOptions, args []string) error { case "blob": for _, t := range []restic.BlobType{restic.DataBlob, restic.TreeBlob} { - list, err := repo.Index().Lookup(id, t) - if err != nil { + list, found := repo.Index().Lookup(id, t) + if !found { continue } blob := list[0] diff --git a/cmd/restic/cmd_diff.go b/cmd/restic/cmd_diff.go index dfaf8c85d..9d9057d33 100644 --- a/cmd/restic/cmd_diff.go +++ b/cmd/restic/cmd_diff.go @@ -135,9 +135,9 @@ func updateBlobs(repo restic.Repository, blobs restic.BlobSet, stats *DiffStat) stats.TreeBlobs++ } - size, err := repo.LookupBlobSize(h.ID, h.Type) - if err != nil { - Warnf("unable to find blob size for %v: %v\n", h, err) + size, found := repo.LookupBlobSize(h.ID, h.Type) + if !found { + Warnf("unable to find blob size for %v\n", h) continue } diff --git a/cmd/restic/cmd_dump.go b/cmd/restic/cmd_dump.go index eb8250979..39c861bff 100644 --- a/cmd/restic/cmd_dump.go +++ b/cmd/restic/cmd_dump.go @@ -59,9 +59,9 @@ func splitPath(path string) []string { func dumpNode(ctx context.Context, repo restic.Repository, node *restic.Node) error { var buf []byte for _, id := range node.Content { - size, err := repo.LookupBlobSize(id, restic.DataBlob) - if err != nil { - return err + size, found := repo.LookupBlobSize(id, restic.DataBlob) + if !found { + return errors.Errorf("id %v not found in repository", id) } buf = buf[:cap(buf)] diff --git a/internal/archiver/archive_reader_test.go b/internal/archiver/archive_reader_test.go index fafc0ed1a..d0fdb06cf 100644 --- a/internal/archiver/archive_reader_test.go +++ b/internal/archiver/archive_reader_test.go @@ -43,9 +43,9 @@ func checkSavedFile(t *testing.T, repo restic.Repository, treeID restic.ID, name // check blobs for i, id := range node.Content { - size, err := repo.LookupBlobSize(id, restic.DataBlob) - if err != nil { - t.Fatal(err) + size, found := repo.LookupBlobSize(id, restic.DataBlob) + if !found { + t.Fatal("Failed to find blob", id.Str()) } buf := restic.NewBlobBuffer(int(size)) @@ -55,7 +55,7 @@ func checkSavedFile(t *testing.T, repo restic.Repository, treeID restic.ID, name } buf2 := make([]byte, int(size)) - _, err = io.ReadFull(rd, buf2) + _, err := io.ReadFull(rd, buf2) if err != nil { t.Fatal(err) } diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go index 20ba863c3..b1e612409 100644 --- a/internal/archiver/archiver.go +++ b/internal/archiver/archiver.go @@ -86,8 +86,7 @@ func (arch *Archiver) isKnownBlob(id restic.ID, t restic.BlobType) bool { arch.knownBlobs.Insert(id) - _, err := arch.repo.Index().Lookup(id, t) - if err == nil { + if arch.repo.Index().Has(id, t) { return true } diff --git a/internal/debug/log_test.go b/internal/debug/log_test.go new file mode 100644 index 000000000..647e16e85 --- /dev/null +++ b/internal/debug/log_test.go @@ -0,0 +1,34 @@ +package debug_test + +import ( + "github.com/restic/restic/internal/debug" + "github.com/restic/restic/internal/restic" + + "testing" +) + +func BenchmarkLogStatic(b *testing.B) { + for i := 0; i < b.N; i++ { + debug.Log("Static string") + } +} + +func BenchmarkLogIDStr(b *testing.B) { + id := restic.NewRandomID() + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + debug.Log("id: %v", id.Str()) + } +} + +func BenchmarkLogIDString(b *testing.B) { + id := restic.NewRandomID() + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + debug.Log("id: %v", id.String()) + } +} diff --git a/internal/fuse/file.go b/internal/fuse/file.go index 233875993..80cdd59ad 100644 --- a/internal/fuse/file.go +++ b/internal/fuse/file.go @@ -4,6 +4,7 @@ package fuse import ( + "github.com/restic/restic/internal/errors" "github.com/restic/restic/internal/restic" "github.com/restic/restic/internal/debug" @@ -36,9 +37,10 @@ func newFile(ctx context.Context, root *Root, inode uint64, node *restic.Node) ( for i, id := range node.Content { size, ok := root.blobSizeCache.Lookup(id) if !ok { - size, err = root.repo.LookupBlobSize(id, restic.DataBlob) - if err != nil { - return nil, err + var found bool + size, found = root.repo.LookupBlobSize(id, restic.DataBlob) + if !found { + return nil, errors.Errorf("id %v not found in repository", id) } } diff --git a/internal/fuse/file_test.go b/internal/fuse/file_test.go index 622b5dd80..c59e731f9 100644 --- a/internal/fuse/file_test.go +++ b/internal/fuse/file_test.go @@ -87,8 +87,8 @@ func TestFuseFile(t *testing.T) { memfile []byte ) for _, id := range content { - size, err := repo.LookupBlobSize(id, restic.DataBlob) - rtest.OK(t, err) + size, found := repo.LookupBlobSize(id, restic.DataBlob) + rtest.Assert(t, found, "Expected to find blob id %v", id) filesize += uint64(size) buf := restic.NewBlobBuffer(int(size)) diff --git a/internal/repository/index.go b/internal/repository/index.go index b42ea6baf..b9ed9cf98 100644 --- a/internal/repository/index.go +++ b/internal/repository/index.go @@ -110,7 +110,7 @@ func (idx *Index) Store(blob restic.PackedBlob) { } // Lookup queries the index for the blob ID and returns a restic.PackedBlob. -func (idx *Index) Lookup(id restic.ID, tpe restic.BlobType) (blobs []restic.PackedBlob, err error) { +func (idx *Index) Lookup(id restic.ID, tpe restic.BlobType) (blobs []restic.PackedBlob, found bool) { idx.m.Lock() defer idx.m.Unlock() @@ -120,9 +120,6 @@ func (idx *Index) Lookup(id restic.ID, tpe restic.BlobType) (blobs []restic.Pack blobs = make([]restic.PackedBlob, 0, len(packs)) for _, p := range packs { - debug.Log("id %v found in pack %v at %d, length %d", - id.Str(), p.packID.Str(), p.offset, p.length) - blob := restic.PackedBlob{ Blob: restic.Blob{ Type: tpe, @@ -136,11 +133,10 @@ func (idx *Index) Lookup(id restic.ID, tpe restic.BlobType) (blobs []restic.Pack blobs = append(blobs, blob) } - return blobs, nil + return blobs, true } - debug.Log("id %v not found", id.Str()) - return nil, errors.Errorf("id %v not found in index", id) + return nil, false } // ListPack returns a list of blobs contained in a pack. @@ -180,13 +176,13 @@ func (idx *Index) Has(id restic.ID, tpe restic.BlobType) bool { // LookupSize returns the length of the plaintext content of the blob with the // given id. -func (idx *Index) LookupSize(id restic.ID, tpe restic.BlobType) (plaintextLength uint, err error) { - blobs, err := idx.Lookup(id, tpe) - if err != nil { - return 0, err +func (idx *Index) LookupSize(id restic.ID, tpe restic.BlobType) (plaintextLength uint, found bool) { + blobs, found := idx.Lookup(id, tpe) + if !found { + return 0, found } - return uint(restic.PlaintextLength(int(blobs[0].Length))), nil + return uint(restic.PlaintextLength(int(blobs[0].Length))), true } // Supersedes returns the list of indexes this index supersedes, if any. diff --git a/internal/repository/index_test.go b/internal/repository/index_test.go index f2e0746c0..599f243f3 100644 --- a/internal/repository/index_test.go +++ b/internal/repository/index_test.go @@ -65,8 +65,8 @@ func TestIndexSerialize(t *testing.T) { rtest.OK(t, err) for _, testBlob := range tests { - list, err := idx.Lookup(testBlob.id, testBlob.tpe) - rtest.OK(t, err) + list, found := idx.Lookup(testBlob.id, testBlob.tpe) + rtest.Assert(t, found, "Expected to find blob id %v", testBlob.id.Str()) if len(list) != 1 { t.Errorf("expected one result for blob %v, got %v: %v", testBlob.id.Str(), len(list), list) @@ -78,8 +78,8 @@ func TestIndexSerialize(t *testing.T) { rtest.Equals(t, testBlob.offset, result.Offset) rtest.Equals(t, testBlob.length, result.Length) - list2, err := idx2.Lookup(testBlob.id, testBlob.tpe) - rtest.OK(t, err) + list2, found := idx2.Lookup(testBlob.id, testBlob.tpe) + rtest.Assert(t, found, "Expected to find blob id %v", testBlob.id) if len(list2) != 1 { t.Errorf("expected one result for blob %v, got %v: %v", testBlob.id.Str(), len(list2), list2) @@ -146,8 +146,8 @@ func TestIndexSerialize(t *testing.T) { // all new blobs must be in the index for _, testBlob := range newtests { - list, err := idx3.Lookup(testBlob.id, testBlob.tpe) - rtest.OK(t, err) + list, found := idx3.Lookup(testBlob.id, testBlob.tpe) + rtest.Assert(t, found, "Expected to find blob id %v", testBlob.id.Str()) if len(list) != 1 { t.Errorf("expected one result for blob %v, got %v: %v", testBlob.id.Str(), len(list), list) @@ -293,8 +293,8 @@ func TestIndexUnserialize(t *testing.T) { rtest.OK(t, err) for _, test := range exampleTests { - list, err := idx.Lookup(test.id, test.tpe) - rtest.OK(t, err) + list, found := idx.Lookup(test.id, test.tpe) + rtest.Assert(t, found, "Expected to find blob id %v", test.id.Str()) if len(list) != 1 { t.Errorf("expected one result for blob %v, got %v: %v", test.id.Str(), len(list), list) @@ -341,8 +341,8 @@ func TestIndexUnserializeOld(t *testing.T) { rtest.OK(t, err) for _, test := range exampleTests { - list, err := idx.Lookup(test.id, test.tpe) - rtest.OK(t, err) + list, found := idx.Lookup(test.id, test.tpe) + rtest.Assert(t, found, "Expected to find blob id %v", test.id.Str()) if len(list) != 1 { t.Errorf("expected one result for blob %v, got %v: %v", test.id.Str(), len(list), list) @@ -383,16 +383,23 @@ func TestIndexPacks(t *testing.T) { const maxPackSize = 16 * 1024 * 1024 -func createRandomIndex() (idx *repository.Index, lookupID restic.ID) { +// This function generates a (insecure) random ID, similar to NewRandomID +func NewRandomTestID(rng *rand.Rand) restic.ID { + id := restic.ID{} + rng.Read(id[:]) + return id +} + +func createRandomIndex(rng *rand.Rand) (idx *repository.Index, lookupID restic.ID) { idx = repository.NewIndex() // create index with 200k pack files for i := 0; i < 200000; i++ { - packID := restic.NewRandomID() + packID := NewRandomTestID(rng) offset := 0 for offset < maxPackSize { size := 2000 + rand.Intn(4*1024*1024) - id := restic.NewRandomID() + id := NewRandomTestID(rng) idx.Store(restic.PackedBlob{ PackID: packID, Blob: restic.Blob{ @@ -415,7 +422,7 @@ func createRandomIndex() (idx *repository.Index, lookupID restic.ID) { } func BenchmarkIndexHasUnknown(b *testing.B) { - idx, _ := createRandomIndex() + idx, _ := createRandomIndex(rand.New(rand.NewSource(0))) lookupID := restic.NewRandomID() b.ResetTimer() @@ -426,7 +433,7 @@ func BenchmarkIndexHasUnknown(b *testing.B) { } func BenchmarkIndexHasKnown(b *testing.B) { - idx, lookupID := createRandomIndex() + idx, lookupID := createRandomIndex(rand.New(rand.NewSource(0))) b.ResetTimer() diff --git a/internal/repository/master_index.go b/internal/repository/master_index.go index ad458774d..02a27d5d1 100644 --- a/internal/repository/master_index.go +++ b/internal/repository/master_index.go @@ -4,7 +4,6 @@ import ( "context" "sync" - "github.com/restic/restic/internal/errors" "github.com/restic/restic/internal/restic" "github.com/restic/restic/internal/debug" @@ -22,36 +21,32 @@ func NewMasterIndex() *MasterIndex { } // Lookup queries all known Indexes for the ID and returns the first match. -func (mi *MasterIndex) Lookup(id restic.ID, tpe restic.BlobType) (blobs []restic.PackedBlob, err error) { +func (mi *MasterIndex) Lookup(id restic.ID, tpe restic.BlobType) (blobs []restic.PackedBlob, found bool) { mi.idxMutex.RLock() defer mi.idxMutex.RUnlock() - debug.Log("looking up id %v, tpe %v", id.Str(), tpe) - for _, idx := range mi.idx { - blobs, err = idx.Lookup(id, tpe) - if err == nil { - debug.Log("found id %v: %v", id.Str(), blobs) + blobs, found = idx.Lookup(id, tpe) + if found { return } } - debug.Log("id %v not found in any index", id.Str()) - return nil, errors.Errorf("id %v not found in any index", id) + return nil, false } // LookupSize queries all known Indexes for the ID and returns the first match. -func (mi *MasterIndex) LookupSize(id restic.ID, tpe restic.BlobType) (uint, error) { +func (mi *MasterIndex) LookupSize(id restic.ID, tpe restic.BlobType) (uint, bool) { mi.idxMutex.RLock() defer mi.idxMutex.RUnlock() for _, idx := range mi.idx { - if idx.Has(id, tpe) { - return idx.LookupSize(id, tpe) + if size, found := idx.LookupSize(id, tpe); found { + return size, found } } - return 0, errors.Errorf("id %v not found in any index", id) + return 0, false } // ListPack returns the list of blobs in a pack. The first matching index is diff --git a/internal/repository/master_index_test.go b/internal/repository/master_index_test.go new file mode 100644 index 000000000..7dfcdda5f --- /dev/null +++ b/internal/repository/master_index_test.go @@ -0,0 +1,123 @@ +package repository_test + +import ( + "math/rand" + "testing" + + "github.com/restic/restic/internal/repository" + "github.com/restic/restic/internal/restic" + rtest "github.com/restic/restic/internal/test" +) + +func TestMasterIndexLookup(t *testing.T) { + idInIdx1 := restic.NewRandomID() + idInIdx2 := restic.NewRandomID() + + blob1 := restic.PackedBlob{ + PackID: restic.NewRandomID(), + Blob: restic.Blob{ + Type: restic.DataBlob, + ID: idInIdx1, + Length: 10, + Offset: 0, + }, + } + + blob2 := restic.PackedBlob{ + PackID: restic.NewRandomID(), + Blob: restic.Blob{ + Type: restic.DataBlob, + ID: idInIdx2, + Length: 100, + Offset: 10, + }, + } + + idx1 := repository.NewIndex() + idx1.Store(blob1) + + idx2 := repository.NewIndex() + idx2.Store(blob2) + + mIdx := repository.NewMasterIndex() + mIdx.Insert(idx1) + mIdx.Insert(idx2) + + blobs, found := mIdx.Lookup(idInIdx1, restic.DataBlob) + rtest.Assert(t, found, "Expected to find blob id %v from index 1", idInIdx1) + rtest.Equals(t, []restic.PackedBlob{blob1}, blobs) + + blobs, found = mIdx.Lookup(idInIdx2, restic.DataBlob) + rtest.Assert(t, found, "Expected to find blob id %v from index 2", idInIdx2) + rtest.Equals(t, []restic.PackedBlob{blob2}, blobs) + + blobs, found = mIdx.Lookup(restic.NewRandomID(), restic.DataBlob) + rtest.Assert(t, !found, "Expected to not find a blob when fetching with a random id") + rtest.Assert(t, blobs == nil, "Expected no blobs when fetching with a random id") +} + +func BenchmarkMasterIndexLookupSingleIndex(b *testing.B) { + idx1, lookupID := createRandomIndex(rand.New(rand.NewSource(0))) + + mIdx := repository.NewMasterIndex() + mIdx.Insert(idx1) + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + mIdx.Lookup(lookupID, restic.DataBlob) + } +} + +func BenchmarkMasterIndexLookupMultipleIndex(b *testing.B) { + rng := rand.New(rand.NewSource(0)) + mIdx := repository.NewMasterIndex() + + for i := 0; i < 5; i++ { + idx, _ := createRandomIndex(rand.New(rng)) + mIdx.Insert(idx) + } + + idx1, lookupID := createRandomIndex(rand.New(rng)) + mIdx.Insert(idx1) + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + mIdx.Lookup(lookupID, restic.DataBlob) + } +} + +func BenchmarkMasterIndexLookupSingleIndexUnknown(b *testing.B) { + lookupID := restic.NewRandomID() + idx1, _ := createRandomIndex(rand.New(rand.NewSource(0))) + + mIdx := repository.NewMasterIndex() + mIdx.Insert(idx1) + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + mIdx.Lookup(lookupID, restic.DataBlob) + } +} + +func BenchmarkMasterIndexLookupMultipleIndexUnknown(b *testing.B) { + rng := rand.New(rand.NewSource(0)) + lookupID := restic.NewRandomID() + mIdx := repository.NewMasterIndex() + + for i := 0; i < 5; i++ { + idx, _ := createRandomIndex(rand.New(rng)) + mIdx.Insert(idx) + } + + idx1, _ := createRandomIndex(rand.New(rng)) + mIdx.Insert(idx1) + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + mIdx.Lookup(lookupID, restic.DataBlob) + } +} diff --git a/internal/repository/repack_test.go b/internal/repository/repack_test.go index d0b1d907d..286962f49 100644 --- a/internal/repository/repack_test.go +++ b/internal/repository/repack_test.go @@ -122,9 +122,9 @@ func findPacksForBlobs(t *testing.T, repo restic.Repository, blobs restic.BlobSe idx := repo.Index() for h := range blobs { - list, err := idx.Lookup(h.ID, h.Type) - if err != nil { - t.Fatal(err) + list, found := idx.Lookup(h.ID, h.Type) + if !found { + t.Fatal("Failed to find blob", h.ID.Str(), "with type", h.Type) } for _, pb := range list { @@ -227,8 +227,8 @@ func TestRepack(t *testing.T) { idx := repo.Index() for h := range keepBlobs { - list, err := idx.Lookup(h.ID, h.Type) - if err != nil { + list, found := idx.Lookup(h.ID, h.Type) + if !found { t.Errorf("unable to find blob %v in repo", h.ID.Str()) continue } @@ -246,7 +246,7 @@ func TestRepack(t *testing.T) { } for h := range removeBlobs { - if _, err := idx.Lookup(h.ID, h.Type); err == nil { + if _, found := idx.Lookup(h.ID, h.Type); found { t.Errorf("blob %v still contained in the repo", h) } } diff --git a/internal/repository/repository.go b/internal/repository/repository.go index d28a9ddd7..a09ba036b 100644 --- a/internal/repository/repository.go +++ b/internal/repository/repository.go @@ -115,10 +115,10 @@ func (r *Repository) loadBlob(ctx context.Context, id restic.ID, t restic.BlobTy debug.Log("load %v with id %v (buf len %v, cap %d)", t, id.Str(), len(plaintextBuf), cap(plaintextBuf)) // lookup packs - blobs, err := r.idx.Lookup(id, t) - if err != nil { - debug.Log("id %v not found in index: %v", id.Str(), err) - return 0, err + blobs, found := r.idx.Lookup(id, t) + if !found { + debug.Log("id %v not found in index", id.Str()) + return 0, errors.Errorf("id %v not found in repository", id) } // try cached pack files first @@ -193,7 +193,7 @@ func (r *Repository) LoadJSONUnpacked(ctx context.Context, t restic.FileType, id } // LookupBlobSize returns the size of blob id. -func (r *Repository) LookupBlobSize(id restic.ID, tpe restic.BlobType) (uint, error) { +func (r *Repository) LookupBlobSize(id restic.ID, tpe restic.BlobType) (uint, bool) { return r.idx.LookupSize(id, tpe) } @@ -576,9 +576,9 @@ func (r *Repository) Close() error { // space. func (r *Repository) LoadBlob(ctx context.Context, t restic.BlobType, id restic.ID, buf []byte) (int, error) { debug.Log("load blob %v into buf (len %v, cap %v)", id.Str(), len(buf), cap(buf)) - size, err := r.idx.LookupSize(id, t) - if err != nil { - return 0, err + size, found := r.idx.LookupSize(id, t) + if !found { + return 0, errors.Errorf("id %v not found in repository", id) } if cap(buf) < restic.CiphertextLength(int(size)) { @@ -610,9 +610,9 @@ func (r *Repository) SaveBlob(ctx context.Context, t restic.BlobType, buf []byte func (r *Repository) LoadTree(ctx context.Context, id restic.ID) (*restic.Tree, error) { debug.Log("load tree %v", id.Str()) - size, err := r.idx.LookupSize(id, restic.TreeBlob) - if err != nil { - return nil, err + size, found := r.idx.LookupSize(id, restic.TreeBlob) + if !found { + return nil, errors.Errorf("tree %v not found in repository", id) } debug.Log("size is %d, create buffer", size) diff --git a/internal/restic/node.go b/internal/restic/node.go index 81576057a..33e5285a8 100644 --- a/internal/restic/node.go +++ b/internal/restic/node.go @@ -285,9 +285,9 @@ func (node Node) createFileAt(ctx context.Context, path string, repo Repository, func (node Node) writeNodeContent(ctx context.Context, repo Repository, f *os.File) error { var buf []byte for _, id := range node.Content { - size, err := repo.LookupBlobSize(id, DataBlob) - if err != nil { - return err + size, found := repo.LookupBlobSize(id, DataBlob) + if !found { + return errors.Errorf("id %v not found in repository", id) } buf = buf[:cap(buf)] diff --git a/internal/restic/repository.go b/internal/restic/repository.go index 21b36b2d8..5a3a01a8a 100644 --- a/internal/restic/repository.go +++ b/internal/restic/repository.go @@ -24,7 +24,7 @@ type Repository interface { Config() Config - LookupBlobSize(ID, BlobType) (uint, error) + LookupBlobSize(ID, BlobType) (uint, bool) // List calls the function fn for each file of type t in the repository. // When an error is returned by fn, processing stops and List() returns the @@ -57,7 +57,7 @@ type Lister interface { // Index keeps track of the blobs are stored within files. type Index interface { Has(ID, BlobType) bool - Lookup(ID, BlobType) ([]PackedBlob, error) + Lookup(ID, BlobType) ([]PackedBlob, bool) Count(BlobType) uint // Each returns a channel that yields all blobs known to the index. When