Don't save exact duplicates when merging indexes

2020-08-04 16:42:38 +02:00 · 2020-08-04 16:42:38 +02:00 · b112533812
commit b112533812
parent 5e63294355
2 changed files with 57 additions and 5 deletions
--- a/internal/repository/index.go
+++ b/internal/repository/index.go
@ -464,7 +464,7 @@ func (idx *Index) TreePacks() restic.IDs {
 }

 // merge() merges indexes, i.e. idx.merge(idx2) merges the contents of idx2 into idx.
-// idx2 is not changed by this method.
+// During merging exact duplicates are removed;  idx2 is not changed by this method.
 func (idx *Index) merge(idx2 *Index) error {
 	idx.m.Lock()
 	defer idx.m.Unlock()
@ -476,18 +476,35 @@ func (idx *Index) merge(idx2 *Index) error {
 	}

 	packlen := len(idx.packs)
+	// first append packs as they might be accessed when looking for duplicates below
+	idx.packs = append(idx.packs, idx2.packs...)
+
 	// copy all index entries of idx2 to idx
 	for typ := range idx2.byType {
 		m2 := &idx2.byType[typ]
 		m := &idx.byType[typ]
-		m2.foreach(func(entry *indexEntry) bool {
-			// packIndex is changed as idx2.pack is appended to idx.pack, see below
-			m.add(entry.id, entry.packIndex+packlen, entry.offset, entry.length)
+
+		// helper func to test if identical entry is contained in idx
+		hasIdenticalEntry := func(e2 *indexEntry) (found bool) {
+			m.foreachWithID(e2.id, func(e *indexEntry) {
+				b := idx.toPackedBlob(e, restic.BlobType(typ))
+				b2 := idx2.toPackedBlob(e2, restic.BlobType(typ))
+				if b.Length == b2.Length && b.Offset == b2.Offset && b.PackID == b2.PackID {
+					found = true
+				}
+			})
+			return found
+		}
+
+		m2.foreach(func(e2 *indexEntry) bool {
+			if !hasIdenticalEntry(e2) {
+				// packIndex needs to be changed as idx2.pack was appended to idx.pack, see above
+				m.add(e2.id, e2.packIndex+packlen, e2.offset, e2.length)
+			}
 			return true
 		})
 	}

-	idx.packs = append(idx.packs, idx2.packs...)
 	idx.treePacks = append(idx.treePacks, idx2.treePacks...)
 	idx.ids = append(idx.ids, idx2.ids...)
 	idx.supersedes = append(idx.supersedes, idx2.supersedes...)
--- a/internal/repository/master_index_test.go
+++ b/internal/repository/master_index_test.go
@ -1,6 +1,7 @@
 package repository_test

 import (
+	"context"
 	"fmt"
 	"math/rand"
 	"testing"
@ -167,6 +168,14 @@ func TestMasterMergeFinalIndexes(t *testing.T) {
 	rtest.Equals(t, []*repository.Index{idx1, idx2}, finalIndexes)

 	mIdx.MergeFinalIndexes()
+	allIndexes := mIdx.All()
+	rtest.Equals(t, 1, len(allIndexes))
+
+	blobCount := 0
+	for _ = range mIdx.Each(context.TODO()) {
+		blobCount++
+	}
+	rtest.Equals(t, 2, blobCount)

 	blobs := mIdx.Lookup(idInIdx1, restic.DataBlob)
 	rtest.Equals(t, []restic.PackedBlob{blob1}, blobs)
@ -176,6 +185,32 @@ func TestMasterMergeFinalIndexes(t *testing.T) {

 	blobs = mIdx.Lookup(restic.NewRandomID(), restic.DataBlob)
 	rtest.Assert(t, blobs == nil, "Expected no blobs when fetching with a random id")
+
+	// merge another index containing identical blobs
+	idx3 := repository.NewIndex()
+	idx3.Store(blob1)
+	idx3.Store(blob2)
+
+	mIdx.Insert(idx3)
+	finalIndexes = mIdx.FinalizeNotFinalIndexes()
+	rtest.Equals(t, []*repository.Index{idx3}, finalIndexes)
+
+	mIdx.MergeFinalIndexes()
+	allIndexes = mIdx.All()
+	rtest.Equals(t, 1, len(allIndexes))
+
+	// Index should have same entries as before!
+	blobs = mIdx.Lookup(idInIdx1, restic.DataBlob)
+	rtest.Equals(t, []restic.PackedBlob{blob1}, blobs)
+
+	blobs = mIdx.Lookup(idInIdx2, restic.DataBlob)
+	rtest.Equals(t, []restic.PackedBlob{blob2}, blobs)
+
+	blobCount = 0
+	for _ = range mIdx.Each(context.TODO()) {
+		blobCount++
+	}
+	rtest.Equals(t, 2, blobCount)
 }

 func createRandomMasterIndex(rng *rand.Rand, num, size int) (*repository.MasterIndex, restic.ID) {