forked from TrueCloudLab/restic
Merge pull request #3290 from aawsome/prune-handle-duplicates
prune: Handle duplicate blobs more efficiently
This commit is contained in:
commit
d9ea1e9ee2
2 changed files with 94 additions and 44 deletions
12
changelog/unreleased/issue-3114
Normal file
12
changelog/unreleased/issue-3114
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
Enhancement: Optimize handling of duplicate blobs in `prune`
|
||||||
|
|
||||||
|
Restic `prune` always used to repack all data files containing duplicate
|
||||||
|
blobs. This effectively removed all duplicates during prune. However, as a
|
||||||
|
consequence all these data files were repacked even if the unused repository
|
||||||
|
space threshold could be reached with less work.
|
||||||
|
|
||||||
|
This is now changed and `prune` works nice and fast also if there are lots
|
||||||
|
of duplicates.
|
||||||
|
|
||||||
|
https://github.com/restic/restic/issues/3114
|
||||||
|
https://github.com/restic/restic/pull/3290
|
|
@ -195,13 +195,12 @@ func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.R
|
||||||
}
|
}
|
||||||
|
|
||||||
type packInfo struct {
|
type packInfo struct {
|
||||||
usedBlobs uint
|
usedBlobs uint
|
||||||
unusedBlobs uint
|
unusedBlobs uint
|
||||||
duplicateBlobs uint
|
usedSize uint64
|
||||||
usedSize uint64
|
unusedSize uint64
|
||||||
unusedSize uint64
|
tpe restic.BlobType
|
||||||
tpe restic.BlobType
|
uncompressed bool
|
||||||
uncompressed bool
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type packInfoWithID struct {
|
type packInfoWithID struct {
|
||||||
|
@ -243,7 +242,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
Verbosef("searching used packs...\n")
|
Verbosef("searching used packs...\n")
|
||||||
|
|
||||||
keepBlobs := restic.NewBlobSet()
|
keepBlobs := restic.NewBlobSet()
|
||||||
duplicateBlobs := restic.NewBlobSet()
|
duplicateBlobs := make(map[restic.BlobHandle]uint8)
|
||||||
|
|
||||||
// iterate over all blobs in index to find out which blobs are duplicates
|
// iterate over all blobs in index to find out which blobs are duplicates
|
||||||
for blob := range repo.Index().Each(ctx) {
|
for blob := range repo.Index().Each(ctx) {
|
||||||
|
@ -256,7 +255,16 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
stats.size.used += size
|
stats.size.used += size
|
||||||
stats.blobs.used++
|
stats.blobs.used++
|
||||||
case keepBlobs.Has(bh): // duplicate blob
|
case keepBlobs.Has(bh): // duplicate blob
|
||||||
duplicateBlobs.Insert(bh)
|
count, ok := duplicateBlobs[bh]
|
||||||
|
if !ok {
|
||||||
|
count = 2 // this one is already the second blob!
|
||||||
|
} else if count < math.MaxUint8 {
|
||||||
|
// don't overflow, but saturate count at 255
|
||||||
|
// this can lead to a non-optimal pack selection, but won't cause
|
||||||
|
// problems otherwise
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
duplicateBlobs[bh] = count
|
||||||
stats.size.duplicate += size
|
stats.size.duplicate += size
|
||||||
stats.blobs.duplicate++
|
stats.blobs.duplicate++
|
||||||
default:
|
default:
|
||||||
|
@ -299,10 +307,9 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
|
|
||||||
bh := blob.BlobHandle
|
bh := blob.BlobHandle
|
||||||
size := uint64(blob.Length)
|
size := uint64(blob.Length)
|
||||||
|
_, isDuplicate := duplicateBlobs[bh]
|
||||||
switch {
|
switch {
|
||||||
case duplicateBlobs.Has(bh): // duplicate blob
|
case isDuplicate: // duplicate blobs will be handled later
|
||||||
ip.usedSize += size
|
|
||||||
ip.duplicateBlobs++
|
|
||||||
case keepBlobs.Has(bh): // used blob, not duplicate
|
case keepBlobs.Has(bh): // used blob, not duplicate
|
||||||
ip.usedSize += size
|
ip.usedSize += size
|
||||||
ip.usedBlobs++
|
ip.usedBlobs++
|
||||||
|
@ -317,21 +324,49 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
indexPack[blob.PackID] = ip
|
indexPack[blob.PackID] = ip
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if duplicate blobs exist, those will be set to either "used" or "unused":
|
||||||
|
// - mark only one occurence of duplicate blobs as used
|
||||||
|
// - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used"
|
||||||
|
// - if there are no used blobs in a pack, possibly mark duplicates as "unused"
|
||||||
|
if len(duplicateBlobs) > 0 {
|
||||||
|
// iterate again over all blobs in index (this is pretty cheap, all in-mem)
|
||||||
|
for blob := range repo.Index().Each(ctx) {
|
||||||
|
bh := blob.BlobHandle
|
||||||
|
count, isDuplicate := duplicateBlobs[bh]
|
||||||
|
if !isDuplicate {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
ip := indexPack[blob.PackID]
|
||||||
|
size := uint64(blob.Length)
|
||||||
|
switch {
|
||||||
|
case count == 0:
|
||||||
|
// used duplicate exists -> mark as unused
|
||||||
|
ip.unusedSize += size
|
||||||
|
ip.unusedBlobs++
|
||||||
|
case ip.usedBlobs > 0, count == 1:
|
||||||
|
// other used blobs in pack or "last" occurency -> mark as used
|
||||||
|
ip.usedSize += size
|
||||||
|
ip.usedBlobs++
|
||||||
|
// let other occurences be marked as unused
|
||||||
|
duplicateBlobs[bh] = 0
|
||||||
|
default:
|
||||||
|
// mark as unused and decrease counter
|
||||||
|
ip.unusedSize += size
|
||||||
|
ip.unusedBlobs++
|
||||||
|
duplicateBlobs[bh] = count - 1
|
||||||
|
}
|
||||||
|
// update indexPack
|
||||||
|
indexPack[blob.PackID] = ip
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Verbosef("collecting packs for deletion and repacking\n")
|
Verbosef("collecting packs for deletion and repacking\n")
|
||||||
removePacksFirst := restic.NewIDSet()
|
removePacksFirst := restic.NewIDSet()
|
||||||
removePacks := restic.NewIDSet()
|
removePacks := restic.NewIDSet()
|
||||||
repackPacks := restic.NewIDSet()
|
repackPacks := restic.NewIDSet()
|
||||||
|
|
||||||
var repackCandidates []packInfoWithID
|
var repackCandidates []packInfoWithID
|
||||||
repackAllPacksWithDuplicates := true
|
|
||||||
|
|
||||||
keep := func(p packInfo) {
|
|
||||||
stats.packs.keep++
|
|
||||||
if p.duplicateBlobs > 0 {
|
|
||||||
repackAllPacksWithDuplicates = false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
repoVersion := repo.Config().Version
|
repoVersion := repo.Config().Version
|
||||||
|
|
||||||
// loop over all packs and decide what to do
|
// loop over all packs and decide what to do
|
||||||
|
@ -346,8 +381,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if p.unusedSize+p.usedSize != uint64(packSize) &&
|
if p.unusedSize+p.usedSize != uint64(packSize) && p.usedBlobs != 0 {
|
||||||
!(p.usedBlobs == 0 && p.duplicateBlobs == 0) {
|
|
||||||
// Pack size does not fit and pack is needed => error
|
// Pack size does not fit and pack is needed => error
|
||||||
// If the pack is not needed, this is no error, the pack can
|
// If the pack is not needed, this is no error, the pack can
|
||||||
// and will be simply removed, see below.
|
// and will be simply removed, see below.
|
||||||
|
@ -358,7 +392,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
|
|
||||||
// statistics
|
// statistics
|
||||||
switch {
|
switch {
|
||||||
case p.usedBlobs == 0 && p.duplicateBlobs == 0:
|
case p.usedBlobs == 0:
|
||||||
stats.packs.unused++
|
stats.packs.unused++
|
||||||
case p.unusedBlobs == 0:
|
case p.unusedBlobs == 0:
|
||||||
stats.packs.used++
|
stats.packs.used++
|
||||||
|
@ -377,7 +411,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
|
|
||||||
// decide what to do
|
// decide what to do
|
||||||
switch {
|
switch {
|
||||||
case p.usedBlobs == 0 && p.duplicateBlobs == 0:
|
case p.usedBlobs == 0:
|
||||||
// All blobs in pack are no longer used => remove pack!
|
// All blobs in pack are no longer used => remove pack!
|
||||||
removePacks.Insert(id)
|
removePacks.Insert(id)
|
||||||
stats.blobs.remove += p.unusedBlobs
|
stats.blobs.remove += p.unusedBlobs
|
||||||
|
@ -385,11 +419,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
|
|
||||||
case opts.RepackCachableOnly && p.tpe == restic.DataBlob:
|
case opts.RepackCachableOnly && p.tpe == restic.DataBlob:
|
||||||
// if this is a data pack and --repack-cacheable-only is set => keep pack!
|
// if this is a data pack and --repack-cacheable-only is set => keep pack!
|
||||||
keep(p)
|
stats.packs.keep++
|
||||||
|
|
||||||
case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
|
case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
|
||||||
// All blobs in pack are used and not duplicates/mixed => keep pack!
|
// All blobs in pack are used and not mixed => keep pack!
|
||||||
keep(p)
|
stats.packs.keep++
|
||||||
|
|
||||||
default:
|
default:
|
||||||
// all other packs are candidates for repacking
|
// all other packs are candidates for repacking
|
||||||
|
@ -410,7 +444,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
// missing packs that are not needed can be ignored
|
// missing packs that are not needed can be ignored
|
||||||
ignorePacks := restic.NewIDSet()
|
ignorePacks := restic.NewIDSet()
|
||||||
for id, p := range indexPack {
|
for id, p := range indexPack {
|
||||||
if p.usedBlobs == 0 && p.duplicateBlobs == 0 {
|
if p.usedBlobs == 0 {
|
||||||
ignorePacks.Insert(id)
|
ignorePacks.Insert(id)
|
||||||
stats.blobs.remove += p.unusedBlobs
|
stats.blobs.remove += p.unusedBlobs
|
||||||
stats.size.remove += p.unusedSize
|
stats.size.remove += p.unusedSize
|
||||||
|
@ -439,15 +473,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
// This is equivalent to sorting by unused / total space.
|
// This is equivalent to sorting by unused / total space.
|
||||||
// Instead of unused[i] / used[i] > unused[j] / used[j] we use
|
// Instead of unused[i] / used[i] > unused[j] / used[j] we use
|
||||||
// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
|
// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
|
||||||
// Morover duplicates and packs containing trees are sorted to the beginning
|
// Morover packs containing trees are sorted to the beginning
|
||||||
sort.Slice(repackCandidates, func(i, j int) bool {
|
sort.Slice(repackCandidates, func(i, j int) bool {
|
||||||
pi := repackCandidates[i].packInfo
|
pi := repackCandidates[i].packInfo
|
||||||
pj := repackCandidates[j].packInfo
|
pj := repackCandidates[j].packInfo
|
||||||
switch {
|
switch {
|
||||||
case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0:
|
|
||||||
return true
|
|
||||||
case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0:
|
|
||||||
return false
|
|
||||||
case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob:
|
case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob:
|
||||||
return true
|
return true
|
||||||
case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob:
|
case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob:
|
||||||
|
@ -458,7 +488,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
|
|
||||||
repack := func(id restic.ID, p packInfo) {
|
repack := func(id restic.ID, p packInfo) {
|
||||||
repackPacks.Insert(id)
|
repackPacks.Insert(id)
|
||||||
stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs
|
stats.blobs.repack += p.unusedBlobs + p.usedBlobs
|
||||||
stats.size.repack += p.unusedSize + p.usedSize
|
stats.size.repack += p.unusedSize + p.usedSize
|
||||||
stats.blobs.repackrm += p.unusedBlobs
|
stats.blobs.repackrm += p.unusedBlobs
|
||||||
stats.size.repackrm += p.unusedSize
|
stats.size.repackrm += p.unusedSize
|
||||||
|
@ -470,25 +500,33 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case reachedRepackSize:
|
case reachedRepackSize:
|
||||||
keep(p.packInfo)
|
stats.packs.keep++
|
||||||
|
|
||||||
case p.duplicateBlobs > 0, p.tpe != restic.DataBlob, p.uncompressed:
|
case p.tpe != restic.DataBlob, p.uncompressed:
|
||||||
// repacking duplicates/non-data/uncompressed-trees is only limited by repackSize
|
// repacking non-data packs / uncompressed-trees is only limited by repackSize
|
||||||
repack(p.ID, p.packInfo)
|
repack(p.ID, p.packInfo)
|
||||||
|
|
||||||
case reachedUnusedSizeAfter:
|
case reachedUnusedSizeAfter:
|
||||||
// for all other packs stop repacking if tolerated unused size is reached.
|
// for all other packs stop repacking if tolerated unused size is reached.
|
||||||
keep(p.packInfo)
|
stats.packs.keep++
|
||||||
|
|
||||||
default:
|
default:
|
||||||
repack(p.ID, p.packInfo)
|
repack(p.ID, p.packInfo)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if all duplicates are repacked, print out correct statistics
|
if len(repackPacks) != 0 {
|
||||||
if repackAllPacksWithDuplicates {
|
// when repacking, we do not want to keep blobs which are
|
||||||
stats.blobs.repackrm += stats.blobs.duplicate
|
// already contained in kept packs, so delete them from keepBlobs
|
||||||
stats.size.repackrm += stats.size.duplicate
|
for blob := range repo.Index().Each(ctx) {
|
||||||
|
if removePacks.Has(blob.PackID) || repackPacks.Has(blob.PackID) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
keepBlobs.Delete(blob.BlobHandle)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// keepBlobs is only needed if packs are repacked
|
||||||
|
keepBlobs = nil
|
||||||
}
|
}
|
||||||
|
|
||||||
Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used))
|
Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used))
|
||||||
|
|
Loading…
Reference in a new issue