forked from TrueCloudLab/restic
prune: handle very high duplication of some blobs
Suggested-By: Alexander Weiss <alex@weissfam.de>
This commit is contained in:
parent
7478cbf70e
commit
9be1bd2acc
2 changed files with 12 additions and 13 deletions
|
@ -1,10 +1,10 @@
|
|||
Enhancement: Improve `prune` in presence of duplicate blobs
|
||||
Enhancement: Optimize handling of duplicate blobs in `prune`
|
||||
|
||||
Restic `prune` always used to repack all data files containing duplicate
|
||||
blobs. This effectively removed all duplicates during prune. However, as a
|
||||
consequence all these data files were repacked even if the unused repository
|
||||
space threshold could be reached with less work.
|
||||
|
||||
Restic `prune` always used to repack all pack files containing duplicate
|
||||
blobs. This effectively removed all duplicates during prune. However, one
|
||||
of the consequences was that all those pack files were downloaded and
|
||||
duplicate blobs did not contribute to the threshold for unused repository
|
||||
space.
|
||||
This has now changed, and `prune` works quickly even if there are lots
|
||||
of duplicates.
|
||||
|
||||
|
|
|
@ -258,12 +258,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
|||
count, ok := duplicateBlobs[bh]
|
||||
if !ok {
|
||||
count = 2 // this one is already the second blob!
|
||||
} else {
|
||||
} else if count < math.MaxUint8 {
|
||||
// don't overflow, but saturate count at 255
|
||||
// this can lead to a non-optimal pack selection, but won't cause
|
||||
// problems otherwise
|
||||
count++
|
||||
if count == 0 {
|
||||
// catch uint8 overflow
|
||||
panic("too many duplicates, prune can only handly up to 255!")
|
||||
}
|
||||
}
|
||||
duplicateBlobs[bh] = count
|
||||
stats.size.duplicate += size
|
||||
|
@ -326,9 +325,9 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
|
|||
}
|
||||
|
||||
// if duplicate blobs exist, those will be set to either "used" or "unused":
|
||||
// - mark only one occurency of duplicate blobs as used
|
||||
// - mark only one occurrence of duplicate blobs as used
|
||||
// - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used"
|
||||
// - if there are no used blobs in a pack, possibly mark duplicates as "usused"
|
||||
// - if there are no used blobs in a pack, possibly mark duplicates as "unused"
|
||||
if len(duplicateBlobs) > 0 {
|
||||
// iterate again over all blobs in index (this is pretty cheap, all in-mem)
|
||||
for blob := range repo.Index().Each(ctx) {
|
||||
|
|
Loading…
Reference in a new issue