repository: fix prune heuristic to allow resuming interrupted runs

Pack files created by an interrupted prune run appear to consist only of
duplicate blobs on the next run. This caused the previous heuristic to
ignore those pack files. Now, a duplicate blob in a specific pack file
is also selected if that pack file contains only duplicate blobs. This
allows prune to select the already rewritten pack files.
This commit is contained in:
Michael Eischer 2024-05-19 23:24:18 +02:00
parent e52033a8bd
commit 027cc64737

View file

@ -72,10 +72,12 @@ type PrunePlan struct {
} }
type packInfo struct { type packInfo struct {
usedBlobs uint usedBlobs uint
unusedBlobs uint unusedBlobs uint
usedSize uint64 duplicateBlobs uint
unusedSize uint64 usedSize uint64
unusedSize uint64
tpe restic.BlobType tpe restic.BlobType
uncompressed bool uncompressed bool
} }
@ -226,6 +228,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs re
// mark as unused for now, we will later on select one copy // mark as unused for now, we will later on select one copy
ip.unusedSize += size ip.unusedSize += size
ip.unusedBlobs++ ip.unusedBlobs++
ip.duplicateBlobs++
// count as duplicate, will later on change one copy to be counted as used // count as duplicate, will later on change one copy to be counted as used
stats.Size.Duplicate += size stats.Size.Duplicate += size
@ -256,6 +259,8 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs re
// if duplicate blobs exist, those will be set to either "used" or "unused": // if duplicate blobs exist, those will be set to either "used" or "unused":
// - mark only one occurrence of duplicate blobs as used // - mark only one occurrence of duplicate blobs as used
// - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used" // - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used"
// - if a pack only consists of duplicates (which by definition are used blobs), mark it as "used". This
// ensures that already rewritten packs are kept.
// - if there are no used blobs in a pack, possibly mark duplicates as "unused" // - if there are no used blobs in a pack, possibly mark duplicates as "unused"
if hasDuplicates { if hasDuplicates {
// iterate again over all blobs in index (this is pretty cheap, all in-mem) // iterate again over all blobs in index (this is pretty cheap, all in-mem)
@ -271,8 +276,10 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs re
ip := indexPack[blob.PackID] ip := indexPack[blob.PackID]
size := uint64(blob.Length) size := uint64(blob.Length)
switch { switch {
case ip.usedBlobs > 0, count == 0: case ip.usedBlobs > 0, (ip.duplicateBlobs == ip.unusedBlobs), count == 0:
// other used blobs in pack or "last" occurrence -> transition to used // other used blobs in pack, only duplicate blobs or "last" occurrence -> transition to used
// a pack file created by an interrupted prune run will consist of only duplicate blobs
// thus select such already repacked pack files
ip.usedSize += size ip.usedSize += size
ip.usedBlobs++ ip.usedBlobs++
ip.unusedSize -= size ip.unusedSize -= size