Merge pull request #2863 from aawsome/index-no-duplicates

Don't save exact duplicates in merged index
This commit is contained in:
MichaelEischer 2020-08-08 18:24:14 +02:00 committed by GitHub
commit eca0f0ad24
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 66 additions and 5 deletions

View file

@ -464,7 +464,7 @@ func (idx *Index) TreePacks() restic.IDs {
}
// merge() merges indexes, i.e. idx.merge(idx2) merges the contents of idx2 into idx.
// idx2 is not changed by this method.
// During merging exact duplicates are removed; idx2 is not changed by this method.
func (idx *Index) merge(idx2 *Index) error {
idx.m.Lock()
defer idx.m.Unlock()
@ -476,18 +476,35 @@ func (idx *Index) merge(idx2 *Index) error {
}
packlen := len(idx.packs)
// first append packs as they might be accessed when looking for duplicates below
idx.packs = append(idx.packs, idx2.packs...)
// copy all index entries of idx2 to idx
for typ := range idx2.byType {
m2 := &idx2.byType[typ]
m := &idx.byType[typ]
m2.foreach(func(entry *indexEntry) bool {
// packIndex is changed as idx2.packs is appended to idx.packs, see below
m.add(entry.id, entry.packIndex+packlen, entry.offset, entry.length)
// helper func to test if identical entry is contained in idx
hasIdenticalEntry := func(e2 *indexEntry) (found bool) {
m.foreachWithID(e2.id, func(e *indexEntry) {
b := idx.toPackedBlob(e, restic.BlobType(typ))
b2 := idx2.toPackedBlob(e2, restic.BlobType(typ))
if b.Length == b2.Length && b.Offset == b2.Offset && b.PackID == b2.PackID {
found = true
}
})
return found
}
m2.foreach(func(e2 *indexEntry) bool {
if !hasIdenticalEntry(e2) {
// packIndex needs to be changed as idx2.packs was appended to idx.packs, see above
m.add(e2.id, e2.packIndex+packlen, e2.offset, e2.length)
}
return true
})
}
idx.packs = append(idx.packs, idx2.packs...)
idx.treePacks = append(idx.treePacks, idx2.treePacks...)
idx.ids = append(idx.ids, idx2.ids...)
idx.supersedes = append(idx.supersedes, idx2.supersedes...)

View file

@ -1,6 +1,7 @@
package repository_test
import (
"context"
"fmt"
"math/rand"
"testing"
@ -167,6 +168,14 @@ func TestMasterMergeFinalIndexes(t *testing.T) {
rtest.Equals(t, []*repository.Index{idx1, idx2}, finalIndexes)
mIdx.MergeFinalIndexes()
allIndexes := mIdx.All()
rtest.Equals(t, 1, len(allIndexes))
blobCount := 0
for _ = range mIdx.Each(context.TODO()) {
blobCount++
}
rtest.Equals(t, 2, blobCount)
blobs := mIdx.Lookup(idInIdx1, restic.DataBlob)
rtest.Equals(t, []restic.PackedBlob{blob1}, blobs)
@ -176,6 +185,32 @@ func TestMasterMergeFinalIndexes(t *testing.T) {
blobs = mIdx.Lookup(restic.NewRandomID(), restic.DataBlob)
rtest.Assert(t, blobs == nil, "Expected no blobs when fetching with a random id")
// merge another index containing identical blobs
idx3 := repository.NewIndex()
idx3.Store(blob1)
idx3.Store(blob2)
mIdx.Insert(idx3)
finalIndexes = mIdx.FinalizeNotFinalIndexes()
rtest.Equals(t, []*repository.Index{idx3}, finalIndexes)
mIdx.MergeFinalIndexes()
allIndexes = mIdx.All()
rtest.Equals(t, 1, len(allIndexes))
// Index should have same entries as before!
blobs = mIdx.Lookup(idInIdx1, restic.DataBlob)
rtest.Equals(t, []restic.PackedBlob{blob1}, blobs)
blobs = mIdx.Lookup(idInIdx2, restic.DataBlob)
rtest.Equals(t, []restic.PackedBlob{blob2}, blobs)
blobCount = 0
for _ = range mIdx.Each(context.TODO()) {
blobCount++
}
rtest.Equals(t, 2, blobCount)
}
func createRandomMasterIndex(rng *rand.Rand, num, size int) (*repository.MasterIndex, restic.ID) {
@ -193,6 +228,15 @@ func createRandomMasterIndex(rng *rand.Rand, num, size int) (*repository.MasterI
return mIdx, lookupID
}
// BenchmarkMasterIndexAlloc reports the allocations made while building a
// master index with createRandomMasterIndex (num=10000, size=5), once per
// benchmark iteration.
func BenchmarkMasterIndexAlloc(b *testing.B) {
	b.ReportAllocs()

	// Fixed seed; the same generator is shared by all iterations.
	src := rand.New(rand.NewSource(0))
	for n := 0; n < b.N; n++ {
		_, _ = createRandomMasterIndex(src, 10000, 5)
	}
}
func BenchmarkMasterIndexLookupSingleIndex(b *testing.B) {
mIdx, lookupID := createRandomMasterIndex(rand.New(rand.NewSource(0)), 1, 200000)