Merge pull request #2781 from aawsome/reduce-index-memory

Reduce index memory
2020-06-14 10:58:28 +02:00 · 2020-06-14 10:58:28 +02:00 · 74bc7141c1
commit 74bc7141c1
parent 1d66bb9e62 1361341c58
4 changed files with 174 additions and 112 deletions
--- a/changelog/unreleased/pull-2781
+++ b/changelog/unreleased/pull-2781
@ -0,0 +1,6 @@
+Enhancement: Reduce memory consumption of in-memory index
+
+We've improved how the index is stored in memory.
+This change reduces memory usage for large repositories by about 30-40%.
+
+https://github.com/restic/restic/pull/2781
--- a/internal/repository/index.go
+++ b/internal/repository/index.go
@ -13,11 +13,48 @@ import (
 	"github.com/restic/restic/internal/debug"
 )

-// Index holds a lookup table for id -> pack.
+// In large repositories, millions of blobs are stored in the repository
+// and restic needs to store an index entry for each blob in memory for
+// most operations.
+// Hence the index data structure defined here is one of the main contributions
+// to the total memory requirements of restic.
+//
+// We use two maps to store each index entry.
+// The first map stores the first entry of a blobtype/blobID
+// The key of the map is a BlobHandle
+// The entries are the actual index entries.
+// In the second map we store duplicate index entries, i.e. entries with same
+// blobtype/blobID
+// In the index entries, we need to reference the packID. As one pack may
+// contain many blobs the packIDs are saved in a separate array and only the index
+// within this array is saved in the indexEntry
+//
+// To compute the needed amount of memory, we need some assumptions.
+// Maps need an overhead of allocated but not needed elements.
+// For computations, we assume an overhead of 50% and use OF=1.5 (overhead factor)
+// As duplicates are only present in edge cases and are also removed by prune runs,
+// we assume that there are no significant duplicates and omit them in the calculations.
+// Moreover we asssume on average a minimum of 8 blobs per pack; BP=8
+// (Note that for large files there should be 3 blobs per pack as the average chunk
+// size is 1.5 MB and the minimum pack size is 4 MB)
+//
+// We have the following sizes:
+// key: 32 + 1 = 33 bytes
+// indexEntry:  8 + 4 + 4 = 16 bytes
+// each packID: 32 bytes
+//
+// To save N index entries, we therefore need:
+// N * OF * (33 + 16) bytes + N * 32 bytes / BP = N * 78 bytes
+
+// Index holds lookup tables for id -> pack.
 type Index struct {
-	m         sync.Mutex
-	pack      map[restic.BlobHandle][]indexEntry
-	treePacks restic.IDs
+	m          sync.Mutex
+	blob       map[restic.BlobHandle]indexEntry
+	duplicates map[restic.BlobHandle][]indexEntry
+	packs      restic.IDs
+	treePacks  restic.IDs
+	// only used by Store, StorePacks does not check for already saved packIDs
+	packIDToIndex map[restic.ID]int

 	final      bool      // set to true for all indexes read from the backend ("finalized")
 	id         restic.ID // set to the ID of the index when it's finalized
@ -26,27 +63,60 @@ type Index struct {
 }

 type indexEntry struct {
-	packID restic.ID
-	offset uint
-	length uint
+	// only save index do packs; i.e. packs[packindex] yields the packID
+	packIndex int
+	offset    uint32
+	length    uint32
 }

 // NewIndex returns a new index.
 func NewIndex() *Index {
 	return &Index{
-		pack:    make(map[restic.BlobHandle][]indexEntry),
-		created: time.Now(),
+		blob:          make(map[restic.BlobHandle]indexEntry),
+		duplicates:    make(map[restic.BlobHandle][]indexEntry),
+		packIDToIndex: make(map[restic.ID]int),
+		created:       time.Now(),
 	}
 }

-func (idx *Index) store(blob restic.PackedBlob) {
+// withDuplicates returns the list of all entries for the given blob handle
+func (idx *Index) withDuplicates(h restic.BlobHandle, entry indexEntry) []indexEntry {
+	entries, ok := idx.duplicates[h]
+	if ok {
+		all := make([]indexEntry, len(entries)+1)
+		all[0] = entry
+		copy(all[1:], entries)
+		return all
+	}
+
+	return []indexEntry{entry}
+}
+
+// addToPacks saves the given pack ID and return the index.
+// This procedere allows to use pack IDs which can be easily garbage collected after.
+func (idx *Index) addToPacks(id restic.ID) int {
+	idx.packs = append(idx.packs, id)
+	return len(idx.packs) - 1
+}
+
+const maxuint32 = 1<<32 - 1
+
+func (idx *Index) store(packIndex int, blob restic.Blob) {
+	// assert that offset and length fit into uint32!
+	if blob.Offset > maxuint32 || blob.Length > maxuint32 {
+		panic("offset or length does not fit in uint32. You have packs > 4GB!")
+	}
 	newEntry := indexEntry{
-		packID: blob.PackID,
-		offset: blob.Offset,
-		length: blob.Length,
+		packIndex: packIndex,
+		offset:    uint32(blob.Offset),
+		length:    uint32(blob.Length),
 	}
 	h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
-	idx.pack[h] = append(idx.pack[h], newEntry)
+	if _, ok := idx.blob[h]; ok {
+		idx.duplicates[h] = append(idx.duplicates[h], newEntry)
+	} else {
+		idx.blob[h] = newEntry
+	}
 }

 // Final returns true iff the index is already written to the repository, it is
@ -70,7 +140,7 @@ var IndexFull = func(idx *Index) bool {

 	debug.Log("checking whether index %p is full", idx)

-	blobs := len(idx.pack)
+	blobs := len(idx.blob)
 	age := time.Now().Sub(idx.created)

 	switch {
@ -98,7 +168,14 @@ func (idx *Index) Store(blob restic.PackedBlob) {

 	debug.Log("%v", blob)

-	idx.store(blob)
+	// get packIndex and save if new packID
+	packIndex, ok := idx.packIDToIndex[blob.PackID]
+	if !ok {
+		packIndex = idx.addToPacks(blob.PackID)
+		idx.packIDToIndex[blob.PackID] = packIndex
+	}
+
+	idx.store(packIndex, blob.Blob)
 }

 // StorePack remembers the ids of all blobs of a given pack
@ -112,9 +189,23 @@ func (idx *Index) StorePack(id restic.ID, blobs []restic.Blob) {
 	}

 	debug.Log("%v", blobs)
+	packIndex := idx.addToPacks(id)

 	for _, blob := range blobs {
-		idx.store(restic.PackedBlob{Blob: blob, PackID: id})
+		idx.store(packIndex, blob)
+	}
+}
+
+// ListPack returns a list of blobs contained in a pack.
+func (idx *Index) indexEntryToPackedBlob(h restic.BlobHandle, entry indexEntry) restic.PackedBlob {
+	return restic.PackedBlob{
+		Blob: restic.Blob{
+			ID:     h.ID,
+			Type:   h.Type,
+			Length: uint(entry.length),
+			Offset: uint(entry.offset),
+		},
+		PackID: idx.packs[entry.packIndex],
 	}
 }

@ -125,21 +216,13 @@ func (idx *Index) Lookup(id restic.ID, tpe restic.BlobType) (blobs []restic.Pack

 	h := restic.BlobHandle{ID: id, Type: tpe}

-	if packs, ok := idx.pack[h]; ok {
-		blobs = make([]restic.PackedBlob, 0, len(packs))
+	blob, ok := idx.blob[h]
+	if ok {
+		blobList := idx.withDuplicates(h, blob)
+		blobs = make([]restic.PackedBlob, 0, len(blobList))

-		for _, p := range packs {
-			blob := restic.PackedBlob{
-				Blob: restic.Blob{
-					Type:   tpe,
-					Length: p.length,
-					ID:     id,
-					Offset: p.offset,
-				},
-				PackID: p.packID,
-			}
-
-			blobs = append(blobs, blob)
+		for _, p := range blobList {
+			blobs = append(blobs, idx.indexEntryToPackedBlob(h, p))
 		}

 		return blobs, true
@ -153,18 +236,10 @@ func (idx *Index) ListPack(id restic.ID) (list []restic.PackedBlob) {
 	idx.m.Lock()
 	defer idx.m.Unlock()

-	for h, packList := range idx.pack {
-		for _, entry := range packList {
-			if entry.packID == id {
-				list = append(list, restic.PackedBlob{
-					Blob: restic.Blob{
-						ID:     h.ID,
-						Type:   h.Type,
-						Length: entry.length,
-						Offset: entry.offset,
-					},
-					PackID: entry.packID,
-				})
+	for h, entry := range idx.blob {
+		for _, blob := range idx.withDuplicates(h, entry) {
+			if idx.packs[blob.packIndex] == id {
+				list = append(list, idx.indexEntryToPackedBlob(h, blob))
 			}
 		}
 	}
@ -179,7 +254,7 @@ func (idx *Index) Has(id restic.ID, tpe restic.BlobType) bool {

 	h := restic.BlobHandle{ID: id, Type: tpe}

-	_, ok := idx.pack[h]
+	_, ok := idx.blob[h]
 	return ok
 }

@ -227,20 +302,12 @@ func (idx *Index) Each(ctx context.Context) <-chan restic.PackedBlob {
 			close(ch)
 		}()

-		for h, packs := range idx.pack {
-			for _, blob := range packs {
+		for h, entry := range idx.blob {
+			for _, blob := range idx.withDuplicates(h, entry) {
 				select {
 				case <-ctx.Done():
 					return
-				case ch <- restic.PackedBlob{
-					Blob: restic.Blob{
-						ID:     h.ID,
-						Type:   h.Type,
-						Offset: blob.offset,
-						Length: blob.length,
-					},
-					PackID: blob.packID,
-				}:
+				case ch <- idx.indexEntryToPackedBlob(h, blob):
 				}
 			}
 		}
@ -255,10 +322,8 @@ func (idx *Index) Packs() restic.IDSet {
 	defer idx.m.Unlock()

 	packs := restic.NewIDSet()
-	for _, list := range idx.pack {
-		for _, entry := range list {
-			packs.Insert(entry.packID)
-		}
+	for _, packID := range idx.packs {
+		packs.Insert(packID)
 	}

 	return packs
@ -270,12 +335,17 @@ func (idx *Index) Count(t restic.BlobType) (n uint) {
 	idx.m.Lock()
 	defer idx.m.Unlock()

-	for h, list := range idx.pack {
+	for h := range idx.blob {
 		if h.Type != t {
 			continue
 		}
-
-		n += uint(len(list))
+		n++
+	}
+	for h, dups := range idx.duplicates {
+		if h.Type != t {
+			continue
+		}
+		n += uint(len(dups))
 	}

 	return
@ -298,25 +368,20 @@ func (idx *Index) generatePackList() ([]*packJSON, error) {
 	list := []*packJSON{}
 	packs := make(map[restic.ID]*packJSON)

-	for h, packedBlobs := range idx.pack {
-		for _, blob := range packedBlobs {
-			if blob.packID.IsNull() {
+	for h, entry := range idx.blob {
+		for _, blob := range idx.withDuplicates(h, entry) {
+			packID := idx.packs[blob.packIndex]
+			if packID.IsNull() {
 				panic("null pack id")
 			}

 			debug.Log("handle blob %v", h)

-			if blob.packID.IsNull() {
-				debug.Log("blob %v has no packID! (offset %v, length %v)",
-					h, blob.offset, blob.length)
-				return nil, errors.Errorf("unable to serialize index: pack for blob %v hasn't been written yet", h)
-			}
-
 			// see if pack is already in map
-			p, ok := packs[blob.packID]
+			p, ok := packs[packID]
 			if !ok {
 				// else create new pack
-				p = &packJSON{ID: blob.packID}
+				p = &packJSON{ID: packID}

 				// and append it to the list and map
 				list = append(list, p)
@ -327,8 +392,8 @@ func (idx *Index) generatePackList() ([]*packJSON, error) {
 			p.Blobs = append(p.Blobs, blobJSON{
 				ID:     h.ID,
 				Type:   h.Type,
-				Offset: blob.offset,
-				Length: blob.length,
+				Offset: uint(blob.offset),
+				Length: uint(blob.length),
 			})
 		}
 	}
@ -376,6 +441,8 @@ func (idx *Index) Finalize() {
 	defer idx.m.Unlock()

 	idx.final = true
+	// clear packIDToIndex as no more elements will be added
+	idx.packIDToIndex = nil
 }

 // ID returns the ID of the index, if available. If the index is not yet
@ -480,16 +547,14 @@ func DecodeIndex(buf []byte) (idx *Index, err error) {
 	idx = NewIndex()
 	for _, pack := range idxJSON.Packs {
 		var data, tree bool
+		packID := idx.addToPacks(pack.ID)

 		for _, blob := range pack.Blobs {
-			idx.store(restic.PackedBlob{
-				Blob: restic.Blob{
-					Type:   blob.Type,
-					ID:     blob.ID,
-					Offset: blob.Offset,
-					Length: blob.Length,
-				},
-				PackID: pack.ID,
+			idx.store(packID, restic.Blob{
+				Type:   blob.Type,
+				ID:     blob.ID,
+				Offset: blob.Offset,
+				Length: blob.Length,
 			})

 			switch blob.Type {
@ -525,16 +590,14 @@ func DecodeOldIndex(buf []byte) (idx *Index, err error) {
 	idx = NewIndex()
 	for _, pack := range list {
 		var data, tree bool
+		packID := idx.addToPacks(pack.ID)

 		for _, blob := range pack.Blobs {
-			idx.store(restic.PackedBlob{
-				Blob: restic.Blob{
-					Type:   blob.Type,
-					ID:     blob.ID,
-					Offset: blob.Offset,
-					Length: blob.Length,
-				},
-				PackID: pack.ID,
+			idx.store(packID, restic.Blob{
+				Type:   blob.Type,
+				ID:     blob.ID,
+				Offset: blob.Offset,
+				Length: blob.Length,
 			})

 			switch blob.Type {
--- a/internal/repository/index_test.go
+++ b/internal/repository/index_test.go
@ -398,18 +398,16 @@ func createRandomIndex(rng *rand.Rand) (idx *repository.Index, lookupID restic.I
 	// create index with 200k pack files
 	for i := 0; i < 200000; i++ {
 		packID := NewRandomTestID(rng)
+		var blobs []restic.Blob
 		offset := 0
 		for offset < maxPackSize {
 			size := 2000 + rand.Intn(4*1024*1024)
 			id := NewRandomTestID(rng)
-			idx.Store(restic.PackedBlob{
-				PackID: packID,
-				Blob: restic.Blob{
-					Type:   restic.DataBlob,
-					ID:     id,
-					Length: uint(size),
-					Offset: uint(offset),
-				},
+			blobs = append(blobs, restic.Blob{
+				Type:   restic.DataBlob,
+				ID:     id,
+				Length: uint(size),
+				Offset: uint(offset),
 			})

 			offset += size
@ -418,6 +416,7 @@ func createRandomIndex(rng *rand.Rand) (idx *repository.Index, lookupID restic.I
 				lookupID = id
 			}
 		}
+		idx.StorePack(packID, blobs)
 	}

 	return idx, lookupID
@ -444,6 +443,13 @@ func BenchmarkIndexHasKnown(b *testing.B) {
 	}
 }

+func BenchmarkIndexAlloc(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		createRandomIndex(rand.New(rand.NewSource(0)))
+	}
+}
+
 func TestIndexHas(t *testing.T) {
 	type testEntry struct {
 		id             restic.ID
--- a/internal/repository/master_index.go
+++ b/internal/repository/master_index.go
@ -132,19 +132,6 @@ func (mi *MasterIndex) Insert(idx *Index) {
 	mi.idx = append(mi.idx, idx)
 }

-// Remove deletes an index from the MasterIndex.
-func (mi *MasterIndex) Remove(index *Index) {
-	mi.idxMutex.Lock()
-	defer mi.idxMutex.Unlock()
-
-	for i, idx := range mi.idx {
-		if idx == index {
-			mi.idx = append(mi.idx[:i], mi.idx[i+1:]...)
-			return
-		}
-	}
-}
-
 // Store remembers the id and pack in the index.
 func (mi *MasterIndex) StorePack(id restic.ID, blobs []restic.Blob) {
 	mi.idxMutex.Lock()