prune: hide implementation details of counted blob set

This commit is contained in:
Michael Eischer 2023-06-02 20:18:46 +02:00
parent ff4775a15f
commit 93098e9265
4 changed files with 61 additions and 42 deletions

View file

@ -188,7 +188,7 @@ func runPruneWithRepo(ctx context.Context, opts PruneOptions, gopts GlobalOption
RepackUncompressed: opts.RepackUncompressed, RepackUncompressed: opts.RepackUncompressed,
} }
plan, err := repository.PlanPrune(ctx, popts, repo, func(ctx context.Context, repo restic.Repository) (usedBlobs restic.CountedBlobSet, err error) { plan, err := repository.PlanPrune(ctx, popts, repo, func(ctx context.Context, repo restic.Repository) (usedBlobs *restic.CountedBlobSet, err error) {
return getUsedBlobs(ctx, repo, ignoreSnapshots, printer) return getUsedBlobs(ctx, repo, ignoreSnapshots, printer)
}, printer) }, printer)
if err != nil { if err != nil {
@ -255,7 +255,7 @@ func printPruneStats(printer progress.Printer, stats repository.PruneStats) erro
return nil return nil
} }
func getUsedBlobs(ctx context.Context, repo restic.Repository, ignoreSnapshots restic.IDSet, printer progress.Printer) (usedBlobs restic.CountedBlobSet, err error) { func getUsedBlobs(ctx context.Context, repo restic.Repository, ignoreSnapshots restic.IDSet, printer progress.Printer) (usedBlobs *restic.CountedBlobSet, err error) {
var snapshotTrees restic.IDs var snapshotTrees restic.IDs
printer.P("loading all snapshots...\n") printer.P("loading all snapshots...\n")
err = restic.ForAllSnapshots(ctx, repo, repo, ignoreSnapshots, err = restic.ForAllSnapshots(ctx, repo, repo, ignoreSnapshots,

View file

@ -60,11 +60,11 @@ type PruneStats struct {
} }
type PrunePlan struct { type PrunePlan struct {
removePacksFirst restic.IDSet // packs to remove first (unreferenced packs) removePacksFirst restic.IDSet // packs to remove first (unreferenced packs)
repackPacks restic.IDSet // packs to repack repackPacks restic.IDSet // packs to repack
keepBlobs restic.CountedBlobSet // blobs to keep during repacking keepBlobs *restic.CountedBlobSet // blobs to keep during repacking
removePacks restic.IDSet // packs to remove removePacks restic.IDSet // packs to remove
ignorePacks restic.IDSet // packs to ignore when rebuilding the index ignorePacks restic.IDSet // packs to ignore when rebuilding the index
repo *Repository repo *Repository
stats PruneStats stats PruneStats
@ -90,7 +90,7 @@ type packInfoWithID struct {
// PlanPrune selects which files to rewrite and which to delete and which blobs to keep. // PlanPrune selects which files to rewrite and which to delete and which blobs to keep.
// Also some summary statistics are returned. // Also some summary statistics are returned.
func PlanPrune(ctx context.Context, opts PruneOptions, repo *Repository, getUsedBlobs func(ctx context.Context, repo restic.Repository) (usedBlobs restic.CountedBlobSet, err error), printer progress.Printer) (*PrunePlan, error) { func PlanPrune(ctx context.Context, opts PruneOptions, repo *Repository, getUsedBlobs func(ctx context.Context, repo restic.Repository) (usedBlobs *restic.CountedBlobSet, err error), printer progress.Printer) (*PrunePlan, error) {
var stats PruneStats var stats PruneStats
if opts.UnsafeRecovery { if opts.UnsafeRecovery {
@ -152,13 +152,13 @@ func PlanPrune(ctx context.Context, opts PruneOptions, repo *Repository, getUsed
return &plan, nil return &plan, nil
} }
func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs restic.CountedBlobSet, stats *PruneStats, printer progress.Printer) (restic.CountedBlobSet, map[restic.ID]packInfo, error) { func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs *restic.CountedBlobSet, stats *PruneStats, printer progress.Printer) (*restic.CountedBlobSet, map[restic.ID]packInfo, error) {
// iterate over all blobs in index to find out which blobs are duplicates // iterate over all blobs in index to find out which blobs are duplicates
// The counter in usedBlobs describes how many instances of the blob exist in the repository index // The counter in usedBlobs describes how many instances of the blob exist in the repository index
// Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist // Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist
err := idx.ListBlobs(ctx, func(blob restic.PackedBlob) { err := idx.ListBlobs(ctx, func(blob restic.PackedBlob) {
bh := blob.BlobHandle bh := blob.BlobHandle
count, ok := usedBlobs[bh] count, ok := usedBlobs.Get(bh)
if ok { if ok {
if count < math.MaxUint8 { if count < math.MaxUint8 {
// don't overflow, but saturate count at 255 // don't overflow, but saturate count at 255
@ -167,7 +167,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs re
count++ count++
} }
usedBlobs[bh] = count usedBlobs.Set(bh, count)
} }
}) })
if err != nil { if err != nil {
@ -176,12 +176,12 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs re
// Check if all used blobs have been found in index // Check if all used blobs have been found in index
missingBlobs := restic.NewBlobSet() missingBlobs := restic.NewBlobSet()
for bh, count := range usedBlobs { usedBlobs.For(func(bh restic.BlobHandle, count uint8) {
if count == 0 { if count == 0 {
// blob does not exist in any pack files // blob does not exist in any pack files
missingBlobs.Insert(bh) missingBlobs.Insert(bh)
} }
} })
if len(missingBlobs) != 0 { if len(missingBlobs) != 0 {
printer.E("%v not found in the index\n\n"+ printer.E("%v not found in the index\n\n"+
@ -221,7 +221,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs re
bh := blob.BlobHandle bh := blob.BlobHandle
size := uint64(blob.Length) size := uint64(blob.Length)
dupCount := usedBlobs[bh] dupCount, _ := usedBlobs.Get(bh)
switch { switch {
case dupCount >= 2: case dupCount >= 2:
hasDuplicates = true hasDuplicates = true
@ -266,7 +266,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs re
// iterate again over all blobs in index (this is pretty cheap, all in-mem) // iterate again over all blobs in index (this is pretty cheap, all in-mem)
err = idx.ListBlobs(ctx, func(blob restic.PackedBlob) { err = idx.ListBlobs(ctx, func(blob restic.PackedBlob) {
bh := blob.BlobHandle bh := blob.BlobHandle
count, ok := usedBlobs[bh] count, ok := usedBlobs.Get(bh)
// skip non-duplicate, aka. normal blobs // skip non-duplicate, aka. normal blobs
// count == 0 is used to mark that this was a duplicate blob with only a single occurrence remaining // count == 0 is used to mark that this was a duplicate blob with only a single occurrence remaining
if !ok || count == 1 { if !ok || count == 1 {
@ -290,7 +290,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs re
stats.Size.Duplicate -= size stats.Size.Duplicate -= size
stats.Blobs.Duplicate-- stats.Blobs.Duplicate--
// let other occurrences remain marked as unused // let other occurrences remain marked as unused
usedBlobs[bh] = 1 usedBlobs.Set(bh, 1)
default: default:
// remain unused and decrease counter // remain unused and decrease counter
count-- count--
@ -299,7 +299,7 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs re
// thus use the special value zero. This will select the last instance of the blob for keeping. // thus use the special value zero. This will select the last instance of the blob for keeping.
count = 0 count = 0
} }
usedBlobs[bh] = count usedBlobs.Set(bh, count)
} }
// update indexPack // update indexPack
indexPack[blob.PackID] = ip indexPack[blob.PackID] = ip
@ -311,11 +311,11 @@ func packInfoFromIndex(ctx context.Context, idx restic.ListBlobser, usedBlobs re
// Sanity check. If no duplicates exist, all blobs have value 1. After handling // Sanity check. If no duplicates exist, all blobs have value 1. After handling
// duplicates, this also applies to duplicates. // duplicates, this also applies to duplicates.
for _, count := range usedBlobs { usedBlobs.For(func(_ restic.BlobHandle, count uint8) {
if count != 1 { if count != 1 {
panic("internal error during blob selection") panic("internal error during blob selection")
} }
} })
return usedBlobs, indexPack, nil return usedBlobs, indexPack, nil
} }
@ -567,7 +567,7 @@ func (plan *PrunePlan) Execute(ctx context.Context, printer progress.Printer) er
// Also remove repacked packs // Also remove repacked packs
plan.removePacks.Merge(plan.repackPacks) plan.removePacks.Merge(plan.repackPacks)
if len(plan.keepBlobs) != 0 { if plan.keepBlobs.Len() != 0 {
printer.E("%v was not repacked\n\n"+ printer.E("%v was not repacked\n\n"+
"Integrity check failed.\n"+ "Integrity check failed.\n"+
"Please report this error (along with the output of the 'prune' run) at\n"+ "Please report this error (along with the output of the 'prune' run) at\n"+

View file

@ -30,7 +30,7 @@ func testPrune(t *testing.T, opts repository.PruneOptions, errOnUnused bool) {
} }
rtest.OK(t, repo.Flush(context.TODO())) rtest.OK(t, repo.Flush(context.TODO()))
plan, err := repository.PlanPrune(context.TODO(), opts, repo, func(ctx context.Context, repo restic.Repository) (usedBlobs restic.CountedBlobSet, err error) { plan, err := repository.PlanPrune(context.TODO(), opts, repo, func(ctx context.Context, repo restic.Repository) (usedBlobs *restic.CountedBlobSet, err error) {
return restic.NewCountedBlobSet(keep.List()...), nil return restic.NewCountedBlobSet(keep.List()...), nil
}, &progress.NoopPrinter{}) }, &progress.NoopPrinter{})
rtest.OK(t, err) rtest.OK(t, err)

View file

@ -5,42 +5,54 @@ import "sort"
// CountedBlobSet is a set of blobs. For each blob it also stores a uint8 value // CountedBlobSet is a set of blobs. For each blob it also stores a uint8 value
// which can be used to track some information. The CountedBlobSet does not use // which can be used to track some information. The CountedBlobSet does not use
// that value in any way. New entries are created with value 0. // that value in any way. New entries are created with value 0.
type CountedBlobSet map[BlobHandle]uint8 type CountedBlobSet struct {
// m holds the members of the set as keys, each mapped to its uint8 value.
// The field is unexported, so code outside this package has to go through
// the methods of CountedBlobSet to read or modify it.
m map[BlobHandle]uint8
}
// NewCountedBlobSet returns a new CountedBlobSet, populated with ids. // NewCountedBlobSet returns a new CountedBlobSet, populated with ids.
// Every handle is stored with value 0; a handle that is passed more than
// once results in a single entry.
func NewCountedBlobSet(handles ...BlobHandle) CountedBlobSet { func NewCountedBlobSet(handles ...BlobHandle) *CountedBlobSet {
m := make(CountedBlobSet) m := CountedBlobSet{}
m.m = make(map[BlobHandle]uint8)
for _, h := range handles { for _, h := range handles {
m[h] = 0 m.m[h] = 0
} }
return m return &m
}
// Get looks up h in the set. It returns the uint8 value stored for h and true
// if h is present; otherwise it returns 0 and false.
func (s *CountedBlobSet) Get(h BlobHandle) (uint8, bool) {
	if value, found := s.m[h]; found {
		return value, true
	}
	return 0, false
}
func (s *CountedBlobSet) Set(h BlobHandle, value uint8) {
s.m[h] = value
} }
// Has returns true iff id is contained in the set. // Has returns true iff id is contained in the set.
// Only the presence of h is reported; the value stored for h is ignored.
func (s CountedBlobSet) Has(h BlobHandle) bool { func (s *CountedBlobSet) Has(h BlobHandle) bool {
_, ok := s[h] _, ok := s.m[h]
return ok return ok
} }
// Insert adds id to the set. // Insert adds id to the set.
func (s CountedBlobSet) Insert(h BlobHandle) { func (s *CountedBlobSet) Insert(h BlobHandle) {
s[h] = 0 s.m[h] = 0
} }
// Delete removes id from the set. // Delete removes id from the set.
// Deleting a handle that is not in the set has no effect.
func (s CountedBlobSet) Delete(h BlobHandle) { func (s *CountedBlobSet) Delete(h BlobHandle) {
delete(s, h) delete(s.m, h)
}
// Len returns the number of blobs in the set.
func (s CountedBlobSet) Len() int { func (s *CountedBlobSet) Len() int {
return len(s) return len(s.m)
}
// List returns a sorted slice of all BlobHandle in the set. // List returns a sorted slice of all BlobHandle in the set.
func (s CountedBlobSet) List() BlobHandles { func (s *CountedBlobSet) List() BlobHandles {
list := make(BlobHandles, 0, len(s)) list := make(BlobHandles, 0, len(s.m))
for h := range s { for h := range s.m {
list = append(list, h) list = append(list, h)
} }
@ -49,7 +61,7 @@ func (s CountedBlobSet) List() BlobHandles {
return list return list
} }
func (s CountedBlobSet) String() string { func (s *CountedBlobSet) String() string {
str := s.List().String() str := s.List().String()
if len(str) < 2 { if len(str) < 2 {
return "{}" return "{}"
@ -59,10 +71,17 @@ func (s CountedBlobSet) String() string {
} }
// Copy returns a copy of the CountedBlobSet. // Copy returns a copy of the CountedBlobSet.
// The copy is backed by a newly allocated map holding the same handles and
// values, so later changes to one set do not show up in the other.
func (s CountedBlobSet) Copy() CountedBlobSet { func (s *CountedBlobSet) Copy() *CountedBlobSet {
cp := make(CountedBlobSet, len(s)) cp := &CountedBlobSet{}
for k, v := range s { cp.m = make(map[BlobHandle]uint8, len(s.m))
cp[k] = v for k, v := range s.m {
cp.m[k] = v
} }
return cp return cp
} }
// For calls cb once for each blob handle in the set, passing the handle and
// the value stored for it. The order in which handles are visited is the
// iteration order of the underlying Go map and therefore unspecified.
func (s *CountedBlobSet) For(cb func(h BlobHandle, value uint8)) {
	for handle, count := range s.m {
		cb(handle, count)
	}
}