Merge pull request #2599 from MichaelEischer/tweak-mem-usage

Reduce memory usage when searching for used blobs
2020-08-01 13:49:02 +02:00 · 2020-08-01 13:49:02 +02:00 · b7b479b668
commit b7b479b668
parent 4cf9656f12 2d7ab9115f
6 changed files with 54 additions and 30 deletions
--- a/changelog/unreleased/pull-2599
+++ b/changelog/unreleased/pull-2599
@ -0,0 +1,6 @@
+Enhancement: Slightly reduce memory usage of prune and stats commands
+
+The prune and stats commands kept directory identifiers in memory twice
+while searching for used blobs.
+
+https://github.com/restic/restic/pull/2599
--- a/cmd/restic/cmd_prune.go
+++ b/cmd/restic/cmd_prune.go
@ -189,14 +189,13 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
 	Verbosef("find data that is still in use for %d snapshots\n", stats.snapshots)

 	usedBlobs := restic.NewBlobSet()
-	seenBlobs := restic.NewBlobSet()

 	bar = newProgressMax(!gopts.Quiet, uint64(len(snapshots)), "snapshots")
 	bar.Start()
 	for _, sn := range snapshots {
 		debug.Log("process snapshot %v", sn.ID())

-		err = restic.FindUsedBlobs(ctx, repo, *sn.Tree, usedBlobs, seenBlobs)
+		err = restic.FindUsedBlobs(ctx, repo, *sn.Tree, usedBlobs)
 		if err != nil {
 			if repo.Backend().IsNotExist(err) {
 				return errors.Fatal("unable to load a tree from the repo: " + err.Error())
--- a/cmd/restic/cmd_stats.go
+++ b/cmd/restic/cmd_stats.go
@ -93,7 +93,6 @@ func runStats(gopts GlobalOptions, args []string) error {
 		uniqueInodes: make(map[uint64]struct{}),
 		fileBlobs:    make(map[string]restic.IDSet),
 		blobs:        restic.NewBlobSet(),
-		blobsSeen:    restic.NewBlobSet(),
 	}

 	if snapshotIDString != "" {
@ -183,7 +182,7 @@ func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo rest
 	if countMode == countModeRawData {
 		// count just the sizes of unique blobs; we don't need to walk the tree
 		// ourselves in this case, since a nifty function does it for us
-		return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen)
+		return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs)
 	}

 	err := walker.Walk(ctx, repo, *snapshot.Tree, restic.NewIDSet(), statsWalkTree(repo, stats))
@ -318,9 +317,9 @@ type statsContainer struct {
 	// blobs that have been seen as a part of the file
 	fileBlobs map[string]restic.IDSet

-	// blobs and blobsSeen are used to count individual
-	// unique blobs, independent of references to files
-	blobs, blobsSeen restic.BlobSet
+	// blobs is used to count individual unique blobs,
+	// independent of references to files
+	blobs restic.BlobSet
 }

 // fileID is a 256-bit hash that distinguishes unique files.
--- a/internal/restic/find.go
+++ b/internal/restic/find.go
@ -2,11 +2,19 @@ package restic

 import "context"

+// TreeLoader loads a tree from a repository.
+type TreeLoader interface {
+	LoadTree(context.Context, ID) (*Tree, error)
+}
+
 // FindUsedBlobs traverses the tree ID and adds all seen blobs (trees and data
-// blobs) to the set blobs. The tree blobs in the `seen` BlobSet will not be visited
-// again.
-func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSet, seen BlobSet) error {
-	blobs.Insert(BlobHandle{ID: treeID, Type: TreeBlob})
+// blobs) to the set blobs. Already seen tree blobs will not be visited again.
+func FindUsedBlobs(ctx context.Context, repo TreeLoader, treeID ID, blobs BlobSet) error {
+	h := BlobHandle{ID: treeID, Type: TreeBlob}
+	if blobs.Has(h) {
+		return nil
+	}
+	blobs.Insert(h)

 	tree, err := repo.LoadTree(ctx, treeID)
 	if err != nil {
@ -20,15 +28,7 @@ func FindUsedBlobs(ctx context.Context, repo Repository, treeID ID, blobs BlobSe
 				blobs.Insert(BlobHandle{ID: blob, Type: DataBlob})
 			}
 		case "dir":
-			subtreeID := *node.Subtree
-			h := BlobHandle{ID: subtreeID, Type: TreeBlob}
-			if seen.Has(h) {
-				continue
-			}
-
-			seen.Insert(h)
-
-			err := FindUsedBlobs(ctx, repo, subtreeID, blobs, seen)
+			err := FindUsedBlobs(ctx, repo, *node.Subtree, blobs)
 			if err != nil {
 				return err
 			}
--- a/internal/restic/find_test.go
+++ b/internal/restic/find_test.go
@ -12,6 +12,7 @@ import (
 	"testing"
 	"time"

+	"github.com/restic/restic/internal/errors"
 	"github.com/restic/restic/internal/repository"
 	"github.com/restic/restic/internal/restic"
 )
@ -93,7 +94,7 @@ func TestFindUsedBlobs(t *testing.T) {

 	for i, sn := range snapshots {
 		usedBlobs := restic.NewBlobSet()
-		err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, usedBlobs, restic.NewBlobSet())
+		err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, usedBlobs)
 		if err != nil {
 			t.Errorf("FindUsedBlobs returned error: %v", err)
 			continue
@ -118,6 +119,31 @@ func TestFindUsedBlobs(t *testing.T) {
 	}
 }

+type ForbiddenRepo struct{}
+
+func (r ForbiddenRepo) LoadTree(ctx context.Context, id restic.ID) (*restic.Tree, error) {
+	return nil, errors.New("should not be called")
+}
+
+func TestFindUsedBlobsSkipsSeenBlobs(t *testing.T) {
+	repo, cleanup := repository.TestRepository(t)
+	defer cleanup()
+
+	snapshot := restic.TestCreateSnapshot(t, repo, findTestTime, findTestDepth, 0)
+	t.Logf("snapshot %v saved, tree %v", snapshot.ID().Str(), snapshot.Tree.Str())
+
+	usedBlobs := restic.NewBlobSet()
+	err := restic.FindUsedBlobs(context.TODO(), repo, *snapshot.Tree, usedBlobs)
+	if err != nil {
+		t.Fatalf("FindUsedBlobs returned error: %v", err)
+	}
+
+	err = restic.FindUsedBlobs(context.TODO(), ForbiddenRepo{}, *snapshot.Tree, usedBlobs)
+	if err != nil {
+		t.Fatalf("FindUsedBlobs returned error: %v", err)
+	}
+}
+
 func BenchmarkFindUsedBlobs(b *testing.B) {
 	repo, cleanup := repository.TestRepository(b)
 	defer cleanup()
@ -127,9 +153,8 @@ func BenchmarkFindUsedBlobs(b *testing.B) {
 	b.ResetTimer()

 	for i := 0; i < b.N; i++ {
-		seen := restic.NewBlobSet()
 		blobs := restic.NewBlobSet()
-		err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, blobs, seen)
+		err := restic.FindUsedBlobs(context.TODO(), repo, *sn.Tree, blobs)
 		if err != nil {
 			b.Error(err)
 		}
--- a/internal/walker/walker.go
+++ b/internal/walker/walker.go
@ -10,11 +10,6 @@ import (
 	"github.com/restic/restic/internal/restic"
 )

-// TreeLoader loads a tree from a repository.
-type TreeLoader interface {
-	LoadTree(context.Context, restic.ID) (*restic.Tree, error)
-}
-
 // SkipNode is returned by WalkFunc when a dir node should not be walked.
 var SkipNode = errors.New("skip this node")

@ -38,7 +33,7 @@ type WalkFunc func(parentTreeID restic.ID, path string, node *restic.Node, nodeE
 // Walk calls walkFn recursively for each node in root. If walkFn returns an
 // error, it is passed up the call stack. The trees in ignoreTrees are not
 // walked. If walkFn ignores trees, these are added to the set.
-func Walk(ctx context.Context, repo TreeLoader, root restic.ID, ignoreTrees restic.IDSet, walkFn WalkFunc) error {
+func Walk(ctx context.Context, repo restic.TreeLoader, root restic.ID, ignoreTrees restic.IDSet, walkFn WalkFunc) error {
 	tree, err := repo.LoadTree(ctx, root)
 	_, err = walkFn(root, "/", nil, err)

@ -60,7 +55,7 @@ func Walk(ctx context.Context, repo TreeLoader, root restic.ID, ignoreTrees rest
 // walk recursively traverses the tree, ignoring subtrees when the ID of the
 // subtree is in ignoreTrees. If err is nil and ignore is true, the subtree ID
 // will be added to ignoreTrees by walk.
-func walk(ctx context.Context, repo TreeLoader, prefix string, parentTreeID restic.ID, tree *restic.Tree, ignoreTrees restic.IDSet, walkFn WalkFunc) (ignore bool, err error) {
+func walk(ctx context.Context, repo restic.TreeLoader, prefix string, parentTreeID restic.ID, tree *restic.Tree, ignoreTrees restic.IDSet, walkFn WalkFunc) (ignore bool, err error) {
 	var allNodesIgnored = true

 	if len(tree.Nodes) == 0 {