Merge pull request #3980 from MichaelEischer/prune-compression-stats

prune: Correctly count used/duplicate blobs for partially compressed repos
This commit is contained in:
Michael Eischer 2022-11-12 20:06:56 +01:00 committed by GitHub
commit 66818a8f98
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 28 additions and 13 deletions

View file

@ -0,0 +1,12 @@
Bugfix: Correct prune statistics for partially compressed repositories
In a partially compressed repository, one data blob can exist in both an
uncompressed and a compressed version. This caused the prune statistics to
become inaccurate and, for example, to report too high a value for the unused size:
> unused size after prune: 16777215.991 TiB
This has been fixed.
https://github.com/restic/restic/issues/3918
https://github.com/restic/restic/pull/3980

View file

@ -307,7 +307,6 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re
// Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist // Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist
idx.Each(ctx, func(blob restic.PackedBlob) { idx.Each(ctx, func(blob restic.PackedBlob) {
bh := blob.BlobHandle bh := blob.BlobHandle
size := uint64(blob.Length)
count, ok := usedBlobs[bh] count, ok := usedBlobs[bh]
if ok { if ok {
if count < math.MaxUint8 { if count < math.MaxUint8 {
@ -317,19 +316,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re
count++ count++
} }
if count == 1 {
stats.size.used += size
stats.blobs.used++
} else {
// duplicate if counted more than once
stats.size.duplicate += size
stats.blobs.duplicate++
}
usedBlobs[bh] = count usedBlobs[bh] = count
} else {
stats.size.unused += size
stats.blobs.unused++
} }
}) })
@ -383,12 +370,22 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re
// mark as unused for now, we will later on select one copy // mark as unused for now, we will later on select one copy
ip.unusedSize += size ip.unusedSize += size
ip.unusedBlobs++ ip.unusedBlobs++
// count as duplicate, will later on change one copy to be counted as used
stats.size.duplicate += size
stats.blobs.duplicate++
case dupCount == 1: // used blob, not duplicate case dupCount == 1: // used blob, not duplicate
ip.usedSize += size ip.usedSize += size
ip.usedBlobs++ ip.usedBlobs++
stats.size.used += size
stats.blobs.used++
default: // unused blob default: // unused blob
ip.unusedSize += size ip.unusedSize += size
ip.unusedBlobs++ ip.unusedBlobs++
stats.size.unused += size
stats.blobs.unused++
} }
if !blob.IsCompressed() { if !blob.IsCompressed() {
ip.uncompressed = true ip.uncompressed = true
@ -421,6 +418,11 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re
ip.usedBlobs++ ip.usedBlobs++
ip.unusedSize -= size ip.unusedSize -= size
ip.unusedBlobs-- ip.unusedBlobs--
// same for the global statistics
stats.size.used += size
stats.blobs.used++
stats.size.duplicate -= size
stats.blobs.duplicate--
// let other occurences remain marked as unused // let other occurences remain marked as unused
usedBlobs[bh] = 1 usedBlobs[bh] = 1
default: default:
@ -686,6 +688,7 @@ func printPruneStats(gopts GlobalOptions, stats pruneStats) error {
func doPrune(ctx context.Context, opts PruneOptions, gopts GlobalOptions, repo restic.Repository, plan prunePlan) (err error) { func doPrune(ctx context.Context, opts PruneOptions, gopts GlobalOptions, repo restic.Repository, plan prunePlan) (err error) {
if opts.DryRun { if opts.DryRun {
if !gopts.JSON && gopts.verbosity >= 2 { if !gopts.JSON && gopts.verbosity >= 2 {
Printf("Repeated prune dry-runs can report slightly different amounts of data to keep or repack. This is expected behavior.\n\n")
if len(plan.removePacksFirst) > 0 { if len(plan.removePacksFirst) > 0 {
Printf("Would have removed the following unreferenced packs:\n%v\n\n", plan.removePacksFirst) Printf("Would have removed the following unreferenced packs:\n%v\n\n", plan.removePacksFirst)
} }