prune: Enhance treatment of duplicates

This commit is contained in:
Alexander Weiss 2021-02-19 16:57:51 +01:00 committed by Michael Eischer
parent 6cbeb4a9f9
commit 7478cbf70e
2 changed files with 91 additions and 34 deletions

View file

@ -0,0 +1,12 @@
Enhancement: Improve `prune` in presence of duplicate blobs
Restic `prune` always used to repack all pack files containing duplicate
blobs. This effectively removed all duplicates during prune. However, one
of the consequences was that all those pack files were downloaded and
duplicate blobs did not contribute to the threshold for unused repository
space.
This is now changed and `prune` works nice and fast even if there are lots
of duplicates.
https://github.com/restic/restic/issues/3114
https://github.com/restic/restic/pull/3290

View file

@ -195,13 +195,12 @@ func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.R
} }
type packInfo struct { type packInfo struct {
usedBlobs uint usedBlobs uint
unusedBlobs uint unusedBlobs uint
duplicateBlobs uint usedSize uint64
usedSize uint64 unusedSize uint64
unusedSize uint64 tpe restic.BlobType
tpe restic.BlobType uncompressed bool
uncompressed bool
} }
type packInfoWithID struct { type packInfoWithID struct {
@ -243,7 +242,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
Verbosef("searching used packs...\n") Verbosef("searching used packs...\n")
keepBlobs := restic.NewBlobSet() keepBlobs := restic.NewBlobSet()
duplicateBlobs := restic.NewBlobSet() duplicateBlobs := make(map[restic.BlobHandle]uint8)
// iterate over all blobs in index to find out which blobs are duplicates // iterate over all blobs in index to find out which blobs are duplicates
for blob := range repo.Index().Each(ctx) { for blob := range repo.Index().Each(ctx) {
@ -256,7 +255,17 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
stats.size.used += size stats.size.used += size
stats.blobs.used++ stats.blobs.used++
case keepBlobs.Has(bh): // duplicate blob case keepBlobs.Has(bh): // duplicate blob
duplicateBlobs.Insert(bh) count, ok := duplicateBlobs[bh]
if !ok {
count = 2 // this one is already the second blob!
} else {
count++
if count == 0 {
// catch uint8 overflow
panic("too many duplicates, prune can only handly up to 255!")
}
}
duplicateBlobs[bh] = count
stats.size.duplicate += size stats.size.duplicate += size
stats.blobs.duplicate++ stats.blobs.duplicate++
default: default:
@ -299,10 +308,9 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
bh := blob.BlobHandle bh := blob.BlobHandle
size := uint64(blob.Length) size := uint64(blob.Length)
_, isDuplicate := duplicateBlobs[bh]
switch { switch {
case duplicateBlobs.Has(bh): // duplicate blob case isDuplicate: // duplicate blobs will be handled later
ip.usedSize += size
ip.duplicateBlobs++
case keepBlobs.Has(bh): // used blob, not duplicate case keepBlobs.Has(bh): // used blob, not duplicate
ip.usedSize += size ip.usedSize += size
ip.usedBlobs++ ip.usedBlobs++
@ -317,19 +325,52 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
indexPack[blob.PackID] = ip indexPack[blob.PackID] = ip
} }
// if duplicate blobs exist, those will be set to either "used" or "unused":
// - mark only one occurrence of duplicate blobs as used
// - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used"
// - if there are no used blobs in a pack, possibly mark duplicates as "unused"
if len(duplicateBlobs) > 0 {
// iterate again over all blobs in index (this is pretty cheap, all in-mem)
for blob := range repo.Index().Each(ctx) {
bh := blob.BlobHandle
count, isDuplicate := duplicateBlobs[bh]
if !isDuplicate {
continue
}
ip := indexPack[blob.PackID]
size := uint64(blob.Length)
switch {
case count == 0:
// used duplicate exists -> mark as unused
ip.unusedSize += size
ip.unusedBlobs++
case ip.usedBlobs > 0, count == 1:
// other used blobs in pack or "last" occurrence -> mark as used
ip.usedSize += size
ip.usedBlobs++
// let other occurrences be marked as unused
duplicateBlobs[bh] = 0
default:
// mark as unused and decrease counter
ip.unusedSize += size
ip.unusedBlobs++
duplicateBlobs[bh] = count - 1
}
// update indexPack
indexPack[blob.PackID] = ip
}
}
Verbosef("collecting packs for deletion and repacking\n") Verbosef("collecting packs for deletion and repacking\n")
removePacksFirst := restic.NewIDSet() removePacksFirst := restic.NewIDSet()
removePacks := restic.NewIDSet() removePacks := restic.NewIDSet()
repackPacks := restic.NewIDSet() repackPacks := restic.NewIDSet()
var repackCandidates []packInfoWithID var repackCandidates []packInfoWithID
repackAllPacksWithDuplicates := true
keep := func(p packInfo) { keep := func(p packInfo) {
stats.packs.keep++ stats.packs.keep++
if p.duplicateBlobs > 0 {
repackAllPacksWithDuplicates = false
}
} }
repoVersion := repo.Config().Version repoVersion := repo.Config().Version
@ -347,7 +388,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
} }
if p.unusedSize+p.usedSize != uint64(packSize) && if p.unusedSize+p.usedSize != uint64(packSize) &&
!(p.usedBlobs == 0 && p.duplicateBlobs == 0) { p.usedBlobs != 0 {
// Pack size does not fit and pack is needed => error // Pack size does not fit and pack is needed => error
// If the pack is not needed, this is no error, the pack can // If the pack is not needed, this is no error, the pack can
// and will be simply removed, see below. // and will be simply removed, see below.
@ -358,7 +399,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// statistics // statistics
switch { switch {
case p.usedBlobs == 0 && p.duplicateBlobs == 0: case p.usedBlobs == 0:
stats.packs.unused++ stats.packs.unused++
case p.unusedBlobs == 0: case p.unusedBlobs == 0:
stats.packs.used++ stats.packs.used++
@ -377,7 +418,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// decide what to do // decide what to do
switch { switch {
case p.usedBlobs == 0 && p.duplicateBlobs == 0: case p.usedBlobs == 0:
// All blobs in pack are no longer used => remove pack! // All blobs in pack are no longer used => remove pack!
removePacks.Insert(id) removePacks.Insert(id)
stats.blobs.remove += p.unusedBlobs stats.blobs.remove += p.unusedBlobs
@ -387,8 +428,8 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// if this is a data pack and --repack-cacheable-only is set => keep pack! // if this is a data pack and --repack-cacheable-only is set => keep pack!
keep(p) keep(p)
case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress: case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
// All blobs in pack are used and not duplicates/mixed => keep pack! // All blobs in pack are used and not mixed => keep pack!
keep(p) keep(p)
default: default:
@ -410,7 +451,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// missing packs that are not needed can be ignored // missing packs that are not needed can be ignored
ignorePacks := restic.NewIDSet() ignorePacks := restic.NewIDSet()
for id, p := range indexPack { for id, p := range indexPack {
if p.usedBlobs == 0 && p.duplicateBlobs == 0 { if p.usedBlobs == 0 {
ignorePacks.Insert(id) ignorePacks.Insert(id)
stats.blobs.remove += p.unusedBlobs stats.blobs.remove += p.unusedBlobs
stats.size.remove += p.unusedSize stats.size.remove += p.unusedSize
@ -439,15 +480,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// This is equivalent to sorting by unused / total space. // This is equivalent to sorting by unused / total space.
// Instead of unused[i] / used[i] > unused[j] / used[j] we use // Instead of unused[i] / used[i] > unused[j] / used[j] we use
// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64 // unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
// Morover duplicates and packs containing trees are sorted to the beginning // Morover packs containing trees are sorted to the beginning
sort.Slice(repackCandidates, func(i, j int) bool { sort.Slice(repackCandidates, func(i, j int) bool {
pi := repackCandidates[i].packInfo pi := repackCandidates[i].packInfo
pj := repackCandidates[j].packInfo pj := repackCandidates[j].packInfo
switch { switch {
case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0:
return true
case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0:
return false
case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob: case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob:
return true return true
case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob: case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob:
@ -458,7 +495,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
repack := func(id restic.ID, p packInfo) { repack := func(id restic.ID, p packInfo) {
repackPacks.Insert(id) repackPacks.Insert(id)
stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs stats.blobs.repack += p.unusedBlobs + p.usedBlobs
stats.size.repack += p.unusedSize + p.usedSize stats.size.repack += p.unusedSize + p.usedSize
stats.blobs.repackrm += p.unusedBlobs stats.blobs.repackrm += p.unusedBlobs
stats.size.repackrm += p.unusedSize stats.size.repackrm += p.unusedSize
@ -472,8 +509,8 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
case reachedRepackSize: case reachedRepackSize:
keep(p.packInfo) keep(p.packInfo)
case p.duplicateBlobs > 0, p.tpe != restic.DataBlob, p.uncompressed: case p.tpe != restic.DataBlob, p.uncompressed:
// repacking duplicates/non-data/uncompressed-trees is only limited by repackSize // repacking non-data packs / uncompressed-trees is only limited by repackSize
repack(p.ID, p.packInfo) repack(p.ID, p.packInfo)
case reachedUnusedSizeAfter: case reachedUnusedSizeAfter:
@ -485,10 +522,18 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
} }
} }
// if all duplicates are repacked, print out correct statistics if len(repackPacks) != 0 {
if repackAllPacksWithDuplicates { // when repacking, we do not want to keep blobs which are
stats.blobs.repackrm += stats.blobs.duplicate // already contained in kept packs, so delete them from keepBlobs
stats.size.repackrm += stats.size.duplicate for blob := range repo.Index().Each(ctx) {
if removePacks.Has(blob.PackID) || repackPacks.Has(blob.PackID) {
continue
}
keepBlobs.Delete(blob.BlobHandle)
}
} else {
// keepBlobs is only needed if packs are repacked
keepBlobs = nil
} }
Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used)) Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used))