prune: Enhance treatment of duplicates

This commit is contained in:
Alexander Weiss 2021-02-19 16:57:51 +01:00 committed by Michael Eischer
parent 6cbeb4a9f9
commit 7478cbf70e
2 changed files with 91 additions and 34 deletions

View file

@ -0,0 +1,12 @@
Enhancement: Improve `prune` in presence of duplicate blobs
Restic `prune` always used to repack all pack files containing duplicate
blobs. This effectively removed all duplicates during prune. However, one
of the consequences was that all those pack files were downloaded and
duplicate blobs did not contribute to the threshold for unused repository
space.
This has now been changed, and `prune` works well and fast even if there are
lots of duplicates.
https://github.com/restic/restic/issues/3114
https://github.com/restic/restic/pull/3290

View file

@ -195,13 +195,12 @@ func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.R
}
type packInfo struct {
usedBlobs uint
unusedBlobs uint
duplicateBlobs uint
usedSize uint64
unusedSize uint64
tpe restic.BlobType
uncompressed bool
usedBlobs uint
unusedBlobs uint
usedSize uint64
unusedSize uint64
tpe restic.BlobType
uncompressed bool
}
type packInfoWithID struct {
@ -243,7 +242,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
Verbosef("searching used packs...\n")
keepBlobs := restic.NewBlobSet()
duplicateBlobs := restic.NewBlobSet()
duplicateBlobs := make(map[restic.BlobHandle]uint8)
// iterate over all blobs in index to find out which blobs are duplicates
for blob := range repo.Index().Each(ctx) {
@ -256,7 +255,17 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
stats.size.used += size
stats.blobs.used++
case keepBlobs.Has(bh): // duplicate blob
duplicateBlobs.Insert(bh)
count, ok := duplicateBlobs[bh]
if !ok {
count = 2 // this one is already the second blob!
} else {
count++
if count == 0 {
// catch uint8 overflow
panic("too many duplicates, prune can only handly up to 255!")
}
}
duplicateBlobs[bh] = count
stats.size.duplicate += size
stats.blobs.duplicate++
default:
@ -299,10 +308,9 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
bh := blob.BlobHandle
size := uint64(blob.Length)
_, isDuplicate := duplicateBlobs[bh]
switch {
case duplicateBlobs.Has(bh): // duplicate blob
ip.usedSize += size
ip.duplicateBlobs++
case isDuplicate: // duplicate blobs will be handled later
case keepBlobs.Has(bh): // used blob, not duplicate
ip.usedSize += size
ip.usedBlobs++
@ -317,19 +325,52 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
indexPack[blob.PackID] = ip
}
// if duplicate blobs exist, those will be set to either "used" or "unused":
// - mark only one occurrence of duplicate blobs as used
// - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used"
// - if there are no used blobs in a pack, possibly mark duplicates as "unused"
if len(duplicateBlobs) > 0 {
// iterate again over all blobs in index (this is pretty cheap, all in-mem)
for blob := range repo.Index().Each(ctx) {
bh := blob.BlobHandle
count, isDuplicate := duplicateBlobs[bh]
if !isDuplicate {
continue
}
ip := indexPack[blob.PackID]
size := uint64(blob.Length)
switch {
case count == 0:
// used duplicate exists -> mark as unused
ip.unusedSize += size
ip.unusedBlobs++
case ip.usedBlobs > 0, count == 1:
// other used blobs in pack or "last" occurrence -> mark as used
ip.usedSize += size
ip.usedBlobs++
// let other occurrences be marked as unused
duplicateBlobs[bh] = 0
default:
// mark as unused and decrease counter
ip.unusedSize += size
ip.unusedBlobs++
duplicateBlobs[bh] = count - 1
}
// update indexPack
indexPack[blob.PackID] = ip
}
}
Verbosef("collecting packs for deletion and repacking\n")
removePacksFirst := restic.NewIDSet()
removePacks := restic.NewIDSet()
repackPacks := restic.NewIDSet()
var repackCandidates []packInfoWithID
repackAllPacksWithDuplicates := true
keep := func(p packInfo) {
stats.packs.keep++
if p.duplicateBlobs > 0 {
repackAllPacksWithDuplicates = false
}
}
repoVersion := repo.Config().Version
@ -347,7 +388,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
}
if p.unusedSize+p.usedSize != uint64(packSize) &&
!(p.usedBlobs == 0 && p.duplicateBlobs == 0) {
p.usedBlobs != 0 {
// Pack size does not fit and pack is needed => error
// If the pack is not needed, this is no error, the pack can
// and will be simply removed, see below.
@ -358,7 +399,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// statistics
switch {
case p.usedBlobs == 0 && p.duplicateBlobs == 0:
case p.usedBlobs == 0:
stats.packs.unused++
case p.unusedBlobs == 0:
stats.packs.used++
@ -377,7 +418,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// decide what to do
switch {
case p.usedBlobs == 0 && p.duplicateBlobs == 0:
case p.usedBlobs == 0:
// All blobs in pack are no longer used => remove pack!
removePacks.Insert(id)
stats.blobs.remove += p.unusedBlobs
@ -387,8 +428,8 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// if this is a data pack and --repack-cacheable-only is set => keep pack!
keep(p)
case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
// All blobs in pack are used and not duplicates/mixed => keep pack!
case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
// All blobs in pack are used and not mixed => keep pack!
keep(p)
default:
@ -410,7 +451,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// missing packs that are not needed can be ignored
ignorePacks := restic.NewIDSet()
for id, p := range indexPack {
if p.usedBlobs == 0 && p.duplicateBlobs == 0 {
if p.usedBlobs == 0 {
ignorePacks.Insert(id)
stats.blobs.remove += p.unusedBlobs
stats.size.remove += p.unusedSize
@ -439,15 +480,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// This is equivalent to sorting by unused / total space.
// Instead of unused[i] / used[i] > unused[j] / used[j] we use
// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
// Moreover duplicates and packs containing trees are sorted to the beginning
// Moreover packs containing trees are sorted to the beginning
sort.Slice(repackCandidates, func(i, j int) bool {
pi := repackCandidates[i].packInfo
pj := repackCandidates[j].packInfo
switch {
case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0:
return true
case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0:
return false
case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob:
return true
case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob:
@ -458,7 +495,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
repack := func(id restic.ID, p packInfo) {
repackPacks.Insert(id)
stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs
stats.blobs.repack += p.unusedBlobs + p.usedBlobs
stats.size.repack += p.unusedSize + p.usedSize
stats.blobs.repackrm += p.unusedBlobs
stats.size.repackrm += p.unusedSize
@ -472,8 +509,8 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
case reachedRepackSize:
keep(p.packInfo)
case p.duplicateBlobs > 0, p.tpe != restic.DataBlob, p.uncompressed:
// repacking duplicates/non-data/uncompressed-trees is only limited by repackSize
case p.tpe != restic.DataBlob, p.uncompressed:
// repacking non-data packs / uncompressed-trees is only limited by repackSize
repack(p.ID, p.packInfo)
case reachedUnusedSizeAfter:
@ -485,10 +522,18 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
}
}
// if all duplicates are repacked, print out correct statistics
if repackAllPacksWithDuplicates {
stats.blobs.repackrm += stats.blobs.duplicate
stats.size.repackrm += stats.size.duplicate
if len(repackPacks) != 0 {
// when repacking, we do not want to keep blobs which are
// already contained in kept packs, so delete them from keepBlobs
for blob := range repo.Index().Each(ctx) {
if removePacks.Has(blob.PackID) || repackPacks.Has(blob.PackID) {
continue
}
keepBlobs.Delete(blob.BlobHandle)
}
} else {
// keepBlobs is only needed if packs are repacked
keepBlobs = nil
}
Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used))