add changelog, benchmark, memory calculation

2020-06-12 12:57:23 +02:00 · 2020-06-12 12:57:23 +02:00 · 7419844885
commit 7419844885
parent dd7b4f54f5
3 changed files with 42 additions and 8 deletions
--- a/changelog/unreleased/pull-2781
+++ b/changelog/unreleased/pull-2781
@ -0,0 +1,6 @@
+Enhancement: Reduce memory consumption of in-memory index
+
+We've improved how the index is stored in memory.
+This change reduces memory usage for large repositories by about 30-40%.
+
+https://github.com/restic/restic/pull/2781
--- a/internal/repository/index.go
+++ b/internal/repository/index.go
@ -13,6 +13,28 @@ import (
 	"github.com/restic/restic/internal/debug"
 )

+// In large repositories, millions of blobs are stored in the repository
+// and restic needs to store an index entry for each blob in memory for
+// most operations.
+// Hence the index data structure defined here is one of the main contributors
+// to the total memory requirements of restic.
+//
+// We use a map to store each index entry.
+// The key of the map is a BlobHandle.
+// The values of the map are slices that contain the actual index entries.
+//
+// To compute the needed amount of memory, we need some assumptions.
+// Maps carry an overhead of elements that are allocated but not in use.
+// For computations, we assume an overhead of 50% and use OF=1.5 (overhead factor).
+//
+// We have the following sizes:
+// key: 32 + 1 = 33 bytes
+// slice: 24 bytes (pointer, len and cap)
+// indexEntry:  32 + 8 + 8 = 48 bytes
+//
+// To save N index entries, we therefore need:
+// N * OF * (33 + 24) bytes + N * 48 bytes = N * 133.5 bytes, i.e. about N * 134 bytes
+
 // Index holds a lookup table for id -> pack.
 type Index struct {
 	m         sync.Mutex
--- a/internal/repository/index_test.go
+++ b/internal/repository/index_test.go
@ -398,18 +398,16 @@ func createRandomIndex(rng *rand.Rand) (idx *repository.Index, lookupID restic.I
 	// create index with 200k pack files
 	for i := 0; i < 200000; i++ {
 		packID := NewRandomTestID(rng)
+		var blobs []restic.Blob
 		offset := 0
 		for offset < maxPackSize {
 			size := 2000 + rand.Intn(4*1024*1024)
 			id := NewRandomTestID(rng)
-			idx.Store(restic.PackedBlob{
-				PackID: packID,
-				Blob: restic.Blob{
-					Type:   restic.DataBlob,
-					ID:     id,
-					Length: uint(size),
-					Offset: uint(offset),
-				},
+			blobs = append(blobs, restic.Blob{
+				Type:   restic.DataBlob,
+				ID:     id,
+				Length: uint(size),
+				Offset: uint(offset),
 			})

 			offset += size
@ -418,6 +416,7 @@ func createRandomIndex(rng *rand.Rand) (idx *repository.Index, lookupID restic.I
 				lookupID = id
 			}
 		}
+		idx.StorePack(packID, blobs)
 	}

 	return idx, lookupID
@ -444,6 +443,13 @@ func BenchmarkIndexHasKnown(b *testing.B) {
 	}
 }

+func BenchmarkIndexAlloc(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		createRandomIndex(rand.New(rand.NewSource(0)))
+	}
+}
+
 func TestIndexHas(t *testing.T) {
 	type testEntry struct {
 		id             restic.ID