prune: Correctly count used/duplicate blobs for partially compressed repos

Counting the first occurrence of a duplicate blob as used and counting
all others as duplicates, independent of which instance of the blob is
kept, is only accurate if all copies of the blob have the same size. This
is no longer the case for a repository containing both compressed and
uncompressed blobs.

Thus, for duplicated blobs, first count all instances as duplicates and
later on subtract the instance that is actually used.
This commit is contained in:
Michael Eischer 2022-10-22 19:10:33 +02:00
parent b57d42905c
commit 05651d6d4f
2 changed files with 27 additions and 13 deletions

View file

@ -0,0 +1,12 @@
Bugfix: Correct prune statistics for partially compressed repositories
In a partially compressed repository, one data blob can exist in both an
uncompressed and a compressed version. This caused the prune statistics to
become inaccurate and, for example, to report too high a value for the unused size:
> unused size after prune: 16777215.991 TiB
This has been fixed.
https://github.com/restic/restic/issues/3918
https://github.com/restic/restic/pull/3980

View file

@ -306,7 +306,6 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re
// Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist
idx.Each(ctx, func(blob restic.PackedBlob) {
bh := blob.BlobHandle
size := uint64(blob.Length)
count, ok := usedBlobs[bh]
if ok {
if count < math.MaxUint8 {
@ -316,19 +315,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re
count++
}
if count == 1 {
stats.size.used += size
stats.blobs.used++
} else {
// duplicate if counted more than once
stats.size.duplicate += size
stats.blobs.duplicate++
}
usedBlobs[bh] = count
} else {
stats.size.unused += size
stats.blobs.unused++
}
})
@ -382,12 +369,22 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re
// mark as unused for now, we will later on select one copy
ip.unusedSize += size
ip.unusedBlobs++
// count as duplicate, will later on change one copy to be counted as used
stats.size.duplicate += size
stats.blobs.duplicate++
case dupCount == 1: // used blob, not duplicate
ip.usedSize += size
ip.usedBlobs++
stats.size.used += size
stats.blobs.used++
default: // unused blob
ip.unusedSize += size
ip.unusedBlobs++
stats.size.unused += size
stats.blobs.unused++
}
if !blob.IsCompressed() {
ip.uncompressed = true
@ -420,6 +417,11 @@ func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs re
ip.usedBlobs++
ip.unusedSize -= size
ip.unusedBlobs--
// same for the global statistics
stats.size.used += size
stats.blobs.used++
stats.size.duplicate -= size
stats.blobs.duplicate--
// let other occurrences remain marked as unused
usedBlobs[bh] = 1
default: