prune: move logic into repository package

2024-04-06 19:21:21 +02:00 · 2024-04-06 19:21:21 +02:00 · fc3b548625
commit fc3b548625
parent df9d4b455d
4 changed files with 599 additions and 599 deletions
--- a/cmd/restic/cmd_prune.go
+++ b/cmd/restic/cmd_prune.go
@ -4,14 +4,11 @@ import (
 	"context"
 	"math"
 	"runtime"
-	"sort"
 	"strconv"
 	"strings"

 	"github.com/restic/restic/internal/debug"
 	"github.com/restic/restic/internal/errors"
-	"github.com/restic/restic/internal/index"
-	"github.com/restic/restic/internal/pack"
 	"github.com/restic/restic/internal/repository"
 	"github.com/restic/restic/internal/restic"
 	"github.com/restic/restic/internal/ui"
@ -21,10 +18,6 @@ import (
 	"github.com/spf13/cobra"
 )

-var errorIndexIncomplete = errors.Fatal("index is not complete")
-var errorPacksMissing = errors.Fatal("packs from index missing in repo")
-var errorSizeNotMatching = errors.Fatal("pack size does not match calculated size from index")
-
 var cmdPrune = &cobra.Command{
 	Use:   "prune [flags]",
 	Short: "Remove unneeded data from the repository",
@ -194,14 +187,26 @@ func runPruneWithRepo(ctx context.Context, opts PruneOptions, gopts GlobalOption
 		return err
 	}

-	plan, stats, err := PlanPrune(ctx, opts, repo, func(ctx context.Context, repo restic.Repository) (usedBlobs restic.CountedBlobSet, err error) {
+	popts := repository.PruneOptions{
+		DryRun:         opts.DryRun,
+		UnsafeRecovery: opts.unsafeRecovery,
+
+		MaxUnusedBytes: opts.maxUnusedBytes,
+		MaxRepackBytes: opts.MaxRepackBytes,
+
+		RepackCachableOnly: opts.RepackCachableOnly,
+		RepackSmall:        opts.RepackSmall,
+		RepackUncompressed: opts.RepackUncompressed,
+	}
+
+	plan, stats, err := repository.PlanPrune(ctx, popts, repo, func(ctx context.Context, repo restic.Repository) (usedBlobs restic.CountedBlobSet, err error) {
 		return getUsedBlobs(ctx, repo, ignoreSnapshots, printer)
 	}, printer)
 	if err != nil {
 		return err
 	}

-	if opts.DryRun {
+	if popts.DryRun {
 		printer.P("\nWould have made the following changes:")
 	}

@ -213,459 +218,11 @@ func runPruneWithRepo(ctx context.Context, opts PruneOptions, gopts GlobalOption
 	// Trigger GC to reset garbage collection threshold
 	runtime.GC()

-	return DoPrune(ctx, opts, repo, plan, printer)
-}
-
-type PruneStats struct {
-	Blobs struct {
-		Used      uint
-		Duplicate uint
-		Unused    uint
-		Remove    uint
-		Repack    uint
-		Repackrm  uint
-	}
-	Size struct {
-		Used         uint64
-		Duplicate    uint64
-		Unused       uint64
-		Remove       uint64
-		Repack       uint64
-		Repackrm     uint64
-		Unref        uint64
-		Uncompressed uint64
-	}
-	Packs struct {
-		Used       uint
-		Unused     uint
-		PartlyUsed uint
-		Unref      uint
-		Keep       uint
-		Repack     uint
-		Remove     uint
-	}
-}
-
-type PrunePlan struct {
-	removePacksFirst restic.IDSet          // packs to remove first (unreferenced packs)
-	repackPacks      restic.IDSet          // packs to repack
-	keepBlobs        restic.CountedBlobSet // blobs to keep during repacking
-	removePacks      restic.IDSet          // packs to remove
-	ignorePacks      restic.IDSet          // packs to ignore when rebuilding the index
-}
-
-type packInfo struct {
-	usedBlobs    uint
-	unusedBlobs  uint
-	usedSize     uint64
-	unusedSize   uint64
-	tpe          restic.BlobType
-	uncompressed bool
-}
-
-type packInfoWithID struct {
-	ID restic.ID
-	packInfo
-	mustCompress bool
-}
-
-// PlanPrune selects which files to rewrite and which to delete and which blobs to keep.
-// Also some summary statistics are returned.
-func PlanPrune(ctx context.Context, opts PruneOptions, repo restic.Repository, getUsedBlobs func(ctx context.Context, repo restic.Repository) (usedBlobs restic.CountedBlobSet, err error), printer progress.Printer) (PrunePlan, PruneStats, error) {
-	var stats PruneStats
-
-	usedBlobs, err := getUsedBlobs(ctx, repo)
-	if err != nil {
-		return PrunePlan{}, stats, err
-	}
-
-	printer.P("searching used packs...\n")
-	keepBlobs, indexPack, err := packInfoFromIndex(ctx, repo.Index(), usedBlobs, &stats, printer)
-	if err != nil {
-		return PrunePlan{}, stats, err
-	}
-
-	printer.P("collecting packs for deletion and repacking\n")
-	plan, err := decidePackAction(ctx, opts, repo, indexPack, &stats, printer)
-	if err != nil {
-		return PrunePlan{}, stats, err
-	}
-
-	if len(plan.repackPacks) != 0 {
-		blobCount := keepBlobs.Len()
-		// when repacking, we do not want to keep blobs which are
-		// already contained in kept packs, so delete them from keepBlobs
-		repo.Index().Each(ctx, func(blob restic.PackedBlob) {
-			if plan.removePacks.Has(blob.PackID) || plan.repackPacks.Has(blob.PackID) {
-				return
-			}
-			keepBlobs.Delete(blob.BlobHandle)
-		})
-
-		if keepBlobs.Len() < blobCount/2 {
-			// replace with copy to shrink map to necessary size if there's a chance to benefit
-			keepBlobs = keepBlobs.Copy()
-		}
-	} else {
-		// keepBlobs is only needed if packs are repacked
-		keepBlobs = nil
-	}
-	plan.keepBlobs = keepBlobs
-
-	return plan, stats, nil
-}
-
-func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs restic.CountedBlobSet, stats *PruneStats, printer progress.Printer) (restic.CountedBlobSet, map[restic.ID]packInfo, error) {
-	// iterate over all blobs in index to find out which blobs are duplicates
-	// The counter in usedBlobs describes how many instances of the blob exist in the repository index
-	// Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist
-	idx.Each(ctx, func(blob restic.PackedBlob) {
-		bh := blob.BlobHandle
-		count, ok := usedBlobs[bh]
-		if ok {
-			if count < math.MaxUint8 {
-				// don't overflow, but saturate count at 255
-				// this can lead to a non-optimal pack selection, but won't cause
-				// problems otherwise
-				count++
-			}
-
-			usedBlobs[bh] = count
-		}
-	})
-
-	// Check if all used blobs have been found in index
-	missingBlobs := restic.NewBlobSet()
-	for bh, count := range usedBlobs {
-		if count == 0 {
-			// blob does not exist in any pack files
-			missingBlobs.Insert(bh)
-		}
-	}
-
-	if len(missingBlobs) != 0 {
-		printer.E("%v not found in the index\n\n"+
-			"Integrity check failed: Data seems to be missing.\n"+
-			"Will not start prune to prevent (additional) data loss!\n"+
-			"Please report this error (along with the output of the 'prune' run) at\n"+
-			"https://github.com/restic/restic/issues/new/choose\n", missingBlobs)
-		return nil, nil, errorIndexIncomplete
-	}
-
-	indexPack := make(map[restic.ID]packInfo)
-
-	// save computed pack header size
-	for pid, hdrSize := range pack.Size(ctx, idx, true) {
-		// initialize tpe with NumBlobTypes to indicate it's not set
-		indexPack[pid] = packInfo{tpe: restic.NumBlobTypes, usedSize: uint64(hdrSize)}
-	}
-
-	hasDuplicates := false
-	// iterate over all blobs in index to generate packInfo
-	idx.Each(ctx, func(blob restic.PackedBlob) {
-		ip := indexPack[blob.PackID]
-
-		// Set blob type if not yet set
-		if ip.tpe == restic.NumBlobTypes {
-			ip.tpe = blob.Type
-		}
-
-		// mark mixed packs with "Invalid blob type"
-		if ip.tpe != blob.Type {
-			ip.tpe = restic.InvalidBlob
-		}
-
-		bh := blob.BlobHandle
-		size := uint64(blob.Length)
-		dupCount := usedBlobs[bh]
-		switch {
-		case dupCount >= 2:
-			hasDuplicates = true
-			// mark as unused for now, we will later on select one copy
-			ip.unusedSize += size
-			ip.unusedBlobs++
-
-			// count as duplicate, will later on change one copy to be counted as used
-			stats.Size.Duplicate += size
-			stats.Blobs.Duplicate++
-		case dupCount == 1: // used blob, not duplicate
-			ip.usedSize += size
-			ip.usedBlobs++
-
-			stats.Size.Used += size
-			stats.Blobs.Used++
-		default: // unused blob
-			ip.unusedSize += size
-			ip.unusedBlobs++
-
-			stats.Size.Unused += size
-			stats.Blobs.Unused++
-		}
-		if !blob.IsCompressed() {
-			ip.uncompressed = true
-		}
-		// update indexPack
-		indexPack[blob.PackID] = ip
-	})
-
-	// if duplicate blobs exist, those will be set to either "used" or "unused":
-	// - mark only one occurrence of duplicate blobs as used
-	// - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used"
-	// - if there are no used blobs in a pack, possibly mark duplicates as "unused"
-	if hasDuplicates {
-		// iterate again over all blobs in index (this is pretty cheap, all in-mem)
-		idx.Each(ctx, func(blob restic.PackedBlob) {
-			bh := blob.BlobHandle
-			count, ok := usedBlobs[bh]
-			// skip non-duplicate, aka. normal blobs
-			// count == 0 is used to mark that this was a duplicate blob with only a single occurrence remaining
-			if !ok || count == 1 {
-				return
-			}
-
-			ip := indexPack[blob.PackID]
-			size := uint64(blob.Length)
-			switch {
-			case ip.usedBlobs > 0, count == 0:
-				// other used blobs in pack or "last" occurrence ->  transition to used
-				ip.usedSize += size
-				ip.usedBlobs++
-				ip.unusedSize -= size
-				ip.unusedBlobs--
-				// same for the global statistics
-				stats.Size.Used += size
-				stats.Blobs.Used++
-				stats.Size.Duplicate -= size
-				stats.Blobs.Duplicate--
-				// let other occurrences remain marked as unused
-				usedBlobs[bh] = 1
-			default:
-				// remain unused and decrease counter
-				count--
-				if count == 1 {
-					// setting count to 1 would lead to forgetting that this blob had duplicates
-					// thus use the special value zero. This will select the last instance of the blob for keeping.
-					count = 0
-				}
-				usedBlobs[bh] = count
-			}
-			// update indexPack
-			indexPack[blob.PackID] = ip
-		})
-	}
-
-	// Sanity check. If no duplicates exist, all blobs have value 1. After handling
-	// duplicates, this also applies to duplicates.
-	for _, count := range usedBlobs {
-		if count != 1 {
-			panic("internal error during blob selection")
-		}
-	}
-
-	return usedBlobs, indexPack, nil
-}
-
-func decidePackAction(ctx context.Context, opts PruneOptions, repo restic.Repository, indexPack map[restic.ID]packInfo, stats *PruneStats, printer progress.Printer) (PrunePlan, error) {
-	removePacksFirst := restic.NewIDSet()
-	removePacks := restic.NewIDSet()
-	repackPacks := restic.NewIDSet()
-
-	var repackCandidates []packInfoWithID
-	var repackSmallCandidates []packInfoWithID
-	repoVersion := repo.Config().Version
-	// only repack very small files by default
-	targetPackSize := repo.PackSize() / 25
-	if opts.RepackSmall {
-		// consider files with at least 80% of the target size as large enough
-		targetPackSize = repo.PackSize() / 5 * 4
-	}
-
-	// loop over all packs and decide what to do
-	bar := printer.NewCounter("packs processed")
-	bar.SetMax(uint64(len(indexPack)))
-	err := repo.List(ctx, restic.PackFile, func(id restic.ID, packSize int64) error {
-		p, ok := indexPack[id]
-		if !ok {
-			// Pack was not referenced in index and is not used  => immediately remove!
-			printer.V("will remove pack %v as it is unused and not indexed\n", id.Str())
-			removePacksFirst.Insert(id)
-			stats.Size.Unref += uint64(packSize)
-			return nil
-		}
-
-		if p.unusedSize+p.usedSize != uint64(packSize) && p.usedBlobs != 0 {
-			// Pack size does not fit and pack is needed => error
-			// If the pack is not needed, this is no error, the pack can
-			// and will be simply removed, see below.
-			printer.E("pack %s: calculated size %d does not match real size %d\nRun 'restic repair index'.\n",
-				id.Str(), p.unusedSize+p.usedSize, packSize)
-			return errorSizeNotMatching
-		}
-
-		// statistics
-		switch {
-		case p.usedBlobs == 0:
-			stats.Packs.Unused++
-		case p.unusedBlobs == 0:
-			stats.Packs.Used++
-		default:
-			stats.Packs.PartlyUsed++
-		}
-
-		if p.uncompressed {
-			stats.Size.Uncompressed += p.unusedSize + p.usedSize
-		}
-		mustCompress := false
-		if repoVersion >= 2 {
-			// repo v2: always repack tree blobs if uncompressed
-			// compress data blobs if requested
-			mustCompress = (p.tpe == restic.TreeBlob || opts.RepackUncompressed) && p.uncompressed
-		}
-
-		// decide what to do
-		switch {
-		case p.usedBlobs == 0:
-			// All blobs in pack are no longer used => remove pack!
-			removePacks.Insert(id)
-			stats.Blobs.Remove += p.unusedBlobs
-			stats.Size.Remove += p.unusedSize
-
-		case opts.RepackCachableOnly && p.tpe == restic.DataBlob:
-			// if this is a data pack and --repack-cacheable-only is set => keep pack!
-			stats.Packs.Keep++
-
-		case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
-			if packSize >= int64(targetPackSize) {
-				// All blobs in pack are used and not mixed => keep pack!
-				stats.Packs.Keep++
-			} else {
-				repackSmallCandidates = append(repackSmallCandidates, packInfoWithID{ID: id, packInfo: p, mustCompress: mustCompress})
-			}
-
-		default:
-			// all other packs are candidates for repacking
-			repackCandidates = append(repackCandidates, packInfoWithID{ID: id, packInfo: p, mustCompress: mustCompress})
-		}
-
-		delete(indexPack, id)
-		bar.Add(1)
-		return nil
-	})
-	bar.Done()
-	if err != nil {
-		return PrunePlan{}, err
-	}
-
-	// At this point indexPacks contains only missing packs!
-
-	// missing packs that are not needed can be ignored
-	ignorePacks := restic.NewIDSet()
-	for id, p := range indexPack {
-		if p.usedBlobs == 0 {
-			ignorePacks.Insert(id)
-			stats.Blobs.Remove += p.unusedBlobs
-			stats.Size.Remove += p.unusedSize
-			delete(indexPack, id)
-		}
-	}
-
-	if len(indexPack) != 0 {
-		printer.E("The index references %d needed pack files which are missing from the repository:\n", len(indexPack))
-		for id := range indexPack {
-			printer.E("  %v\n", id)
-		}
-		return PrunePlan{}, errorPacksMissing
-	}
-	if len(ignorePacks) != 0 {
-		printer.E("Missing but unneeded pack files are referenced in the index, will be repaired\n")
-		for id := range ignorePacks {
-			printer.E("will forget missing pack file %v\n", id)
-		}
-	}
-
-	if len(repackSmallCandidates) < 10 {
-		// too few small files to be worth the trouble, this also prevents endlessly repacking
-		// if there is just a single pack file below the target size
-		stats.Packs.Keep += uint(len(repackSmallCandidates))
-	} else {
-		repackCandidates = append(repackCandidates, repackSmallCandidates...)
-	}
-
-	// Sort repackCandidates such that packs with highest ratio unused/used space are picked first.
-	// This is equivalent to sorting by unused / total space.
-	// Instead of unused[i] / used[i] > unused[j] / used[j] we use
-	// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
-	// Moreover packs containing trees and too small packs are sorted to the beginning
-	sort.Slice(repackCandidates, func(i, j int) bool {
-		pi := repackCandidates[i].packInfo
-		pj := repackCandidates[j].packInfo
-		switch {
-		case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob:
-			return true
-		case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob:
-			return false
-		case pi.unusedSize+pi.usedSize < uint64(targetPackSize) && pj.unusedSize+pj.usedSize >= uint64(targetPackSize):
-			return true
-		case pj.unusedSize+pj.usedSize < uint64(targetPackSize) && pi.unusedSize+pi.usedSize >= uint64(targetPackSize):
-			return false
-		}
-		return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize
-	})
-
-	repack := func(id restic.ID, p packInfo) {
-		repackPacks.Insert(id)
-		stats.Blobs.Repack += p.unusedBlobs + p.usedBlobs
-		stats.Size.Repack += p.unusedSize + p.usedSize
-		stats.Blobs.Repackrm += p.unusedBlobs
-		stats.Size.Repackrm += p.unusedSize
-		if p.uncompressed {
-			stats.Size.Uncompressed -= p.unusedSize + p.usedSize
-		}
-	}
-
-	// calculate limit for number of unused bytes in the repo after repacking
-	maxUnusedSizeAfter := opts.maxUnusedBytes(stats.Size.Used)
-
-	for _, p := range repackCandidates {
-		reachedUnusedSizeAfter := (stats.Size.Unused-stats.Size.Remove-stats.Size.Repackrm < maxUnusedSizeAfter)
-		reachedRepackSize := stats.Size.Repack+p.unusedSize+p.usedSize >= opts.MaxRepackBytes
-		packIsLargeEnough := p.unusedSize+p.usedSize >= uint64(targetPackSize)
-
-		switch {
-		case reachedRepackSize:
-			stats.Packs.Keep++
-
-		case p.tpe != restic.DataBlob, p.mustCompress:
-			// repacking non-data packs / uncompressed-trees is only limited by repackSize
-			repack(p.ID, p.packInfo)
-
-		case reachedUnusedSizeAfter && packIsLargeEnough:
-			// for all other packs stop repacking if tolerated unused size is reached.
-			stats.Packs.Keep++
-
-		default:
-			repack(p.ID, p.packInfo)
-		}
-	}
-
-	stats.Packs.Unref = uint(len(removePacksFirst))
-	stats.Packs.Repack = uint(len(repackPacks))
-	stats.Packs.Remove = uint(len(removePacks))
-
-	if repo.Config().Version < 2 {
-		// compression not supported for repository format version 1
-		stats.Size.Uncompressed = 0
-	}
-
-	return PrunePlan{removePacksFirst: removePacksFirst,
-		removePacks: removePacks,
-		repackPacks: repackPacks,
-		ignorePacks: ignorePacks,
-	}, nil
+	return repository.DoPrune(ctx, popts, repo, plan, printer)
 }

 // printPruneStats prints out the statistics
-func printPruneStats(printer progress.Printer, stats PruneStats) error {
+func printPruneStats(printer progress.Printer, stats repository.PruneStats) error {
 	printer.V("\nused:         %10d blobs / %s\n", stats.Blobs.Used, ui.FormatBytes(stats.Size.Used))
 	if stats.Blobs.Duplicate > 0 {
 		printer.V("duplicates:   %10d blobs / %s\n", stats.Blobs.Duplicate, ui.FormatBytes(stats.Size.Duplicate))
@ -706,107 +263,6 @@ func printPruneStats(printer progress.Printer, stats PruneStats) error {
 	return nil
 }

-// DoPrune does the actual pruning:
-// - remove unreferenced packs first
-// - repack given pack files while keeping the given blobs
-// - rebuild the index while ignoring all files that will be deleted
-// - delete the files
-// plan.removePacks and plan.ignorePacks are modified in this function.
-func DoPrune(ctx context.Context, opts PruneOptions, repo restic.Repository, plan PrunePlan, printer progress.Printer) (err error) {
-	if opts.DryRun {
-		printer.V("Repeated prune dry-runs can report slightly different amounts of data to keep or repack. This is expected behavior.\n\n")
-		if len(plan.removePacksFirst) > 0 {
-			printer.V("Would have removed the following unreferenced packs:\n%v\n\n", plan.removePacksFirst)
-		}
-		printer.V("Would have repacked and removed the following packs:\n%v\n\n", plan.repackPacks)
-		printer.V("Would have removed the following no longer used packs:\n%v\n\n", plan.removePacks)
-		// Always quit here if DryRun was set!
-		return nil
-	}
-
-	// unreferenced packs can be safely deleted first
-	if len(plan.removePacksFirst) != 0 {
-		printer.P("deleting unreferenced packs\n")
-		DeleteFiles(ctx, repo, plan.removePacksFirst, restic.PackFile, printer)
-	}
-
-	if len(plan.repackPacks) != 0 {
-		printer.P("repacking packs\n")
-		bar := printer.NewCounter("packs repacked")
-		bar.SetMax(uint64(len(plan.repackPacks)))
-		_, err := repository.Repack(ctx, repo, repo, plan.repackPacks, plan.keepBlobs, bar)
-		bar.Done()
-		if err != nil {
-			return errors.Fatal(err.Error())
-		}
-
-		// Also remove repacked packs
-		plan.removePacks.Merge(plan.repackPacks)
-
-		if len(plan.keepBlobs) != 0 {
-			printer.E("%v was not repacked\n\n"+
-				"Integrity check failed.\n"+
-				"Please report this error (along with the output of the 'prune' run) at\n"+
-				"https://github.com/restic/restic/issues/new/choose\n", plan.keepBlobs)
-			return errors.Fatal("internal error: blobs were not repacked")
-		}
-
-		// allow GC of the blob set
-		plan.keepBlobs = nil
-	}
-
-	if len(plan.ignorePacks) == 0 {
-		plan.ignorePacks = plan.removePacks
-	} else {
-		plan.ignorePacks.Merge(plan.removePacks)
-	}
-
-	if opts.unsafeRecovery {
-		printer.P("deleting index files\n")
-		indexFiles := repo.Index().(*index.MasterIndex).IDs()
-		err = DeleteFilesChecked(ctx, repo, indexFiles, restic.IndexFile, printer)
-		if err != nil {
-			return errors.Fatalf("%s", err)
-		}
-	} else if len(plan.ignorePacks) != 0 {
-		err = rebuildIndexFiles(ctx, repo, plan.ignorePacks, nil, false, printer)
-		if err != nil {
-			return errors.Fatalf("%s", err)
-		}
-	}
-
-	if len(plan.removePacks) != 0 {
-		printer.P("removing %d old packs\n", len(plan.removePacks))
-		DeleteFiles(ctx, repo, plan.removePacks, restic.PackFile, printer)
-	}
-
-	if opts.unsafeRecovery {
-		err = rebuildIndexFiles(ctx, repo, plan.ignorePacks, nil, true, printer)
-		if err != nil {
-			return errors.Fatalf("%s", err)
-		}
-	}
-
-	printer.P("done\n")
-	return nil
-}
-
-func rebuildIndexFiles(ctx context.Context, repo restic.Repository, removePacks restic.IDSet, extraObsolete restic.IDs, skipDeletion bool, printer progress.Printer) error {
-	printer.P("rebuilding index\n")
-
-	bar := printer.NewCounter("packs processed")
-	return repo.Index().Save(ctx, repo, removePacks, extraObsolete, restic.MasterIndexSaveOpts{
-		SaveProgress: bar,
-		DeleteProgress: func() *progress.Counter {
-			return printer.NewCounter("old indexes deleted")
-		},
-		DeleteReport: func(id restic.ID, _ error) {
-			printer.VV("removed index %v\n", id.String())
-		},
-		SkipDeletion: skipDeletion,
-	})
-}
-
 func getUsedBlobs(ctx context.Context, repo restic.Repository, ignoreSnapshots restic.IDSet, printer progress.Printer) (usedBlobs restic.CountedBlobSet, err error) {
 	var snapshotTrees restic.IDs
 	printer.P("loading all snapshots...\n")
--- a/cmd/restic/cmd_prune_integration_test.go
+++ b/cmd/restic/cmd_prune_integration_test.go
@ -7,6 +7,7 @@ import (
 	"testing"

 	"github.com/restic/restic/internal/backend"
+	"github.com/restic/restic/internal/repository"
 	rtest "github.com/restic/restic/internal/test"
 	"github.com/restic/restic/internal/ui/termstatus"
 )
@ -145,7 +146,7 @@ func TestPruneWithDamagedRepository(t *testing.T) {
 	// prune should fail
 	rtest.Assert(t, withTermStatus(env.gopts, func(ctx context.Context, term *termstatus.Terminal) error {
 		return runPrune(context.TODO(), pruneDefaultOptions, env.gopts, term)
-	}) == errorPacksMissing,
+	}) == repository.ErrPacksMissing,
 		"prune should have reported index not complete error")
 }

--- a/cmd/restic/delete.go
+++ b/cmd/restic/delete.go
@ -1,38 +0,0 @@
-package main
-
-import (
-	"context"
-
-	"github.com/restic/restic/internal/restic"
-	"github.com/restic/restic/internal/ui/progress"
-)
-
-// DeleteFiles deletes the given fileList of fileType in parallel
-// it will print a warning if there is an error, but continue deleting the remaining files
-func DeleteFiles(ctx context.Context, repo restic.Repository, fileList restic.IDSet, fileType restic.FileType, printer progress.Printer) {
-	_ = deleteFiles(ctx, true, repo, fileList, fileType, printer)
-}
-
-// DeleteFilesChecked deletes the given fileList of fileType in parallel
-// if an error occurs, it will cancel and return this error
-func DeleteFilesChecked(ctx context.Context, repo restic.Repository, fileList restic.IDSet, fileType restic.FileType, printer progress.Printer) error {
-	return deleteFiles(ctx, false, repo, fileList, fileType, printer)
-}
-
-// deleteFiles deletes the given fileList of fileType in parallel
-// if ignoreError=true, it will print a warning if there was an error, else it will abort.
-func deleteFiles(ctx context.Context, ignoreError bool, repo restic.Repository, fileList restic.IDSet, fileType restic.FileType, printer progress.Printer) error {
-	bar := printer.NewCounter("files deleted")
-	defer bar.Done()
-
-	return restic.ParallelRemove(ctx, repo, fileList, fileType, func(id restic.ID, err error) error {
-		if err != nil {
-			printer.E("unable to remove %v/%v from the repository\n", fileType, id)
-			if !ignoreError {
-				return err
-			}
-		}
-		printer.VV("removed %v/%v\n", fileType, id)
-		return nil
-	}, bar)
-}
--- a/internal/repository/prune.go
+++ b/internal/repository/prune.go
@ -0,0 +1,581 @@
+package repository
+
+import (
+	"context"
+	"math"
+	"sort"
+
+	"github.com/restic/restic/internal/errors"
+	"github.com/restic/restic/internal/index"
+	"github.com/restic/restic/internal/pack"
+	"github.com/restic/restic/internal/restic"
+	"github.com/restic/restic/internal/ui/progress"
+)
+
+var ErrIndexIncomplete = errors.Fatal("index is not complete")
+var ErrPacksMissing = errors.Fatal("packs from index missing in repo")
+var ErrSizeNotMatching = errors.Fatal("pack size does not match calculated size from index")
+
+// PruneOptions collects all options for the cleanup command.
+type PruneOptions struct {
+	DryRun         bool
+	UnsafeRecovery bool
+
+	MaxUnusedBytes func(used uint64) (unused uint64) // calculates the number of unused bytes after repacking, according to MaxUnused
+	MaxRepackBytes uint64
+
+	RepackCachableOnly bool
+	RepackSmall        bool
+	RepackUncompressed bool
+}
+
+type PruneStats struct {
+	Blobs struct {
+		Used      uint
+		Duplicate uint
+		Unused    uint
+		Remove    uint
+		Repack    uint
+		Repackrm  uint
+	}
+	Size struct {
+		Used         uint64
+		Duplicate    uint64
+		Unused       uint64
+		Remove       uint64
+		Repack       uint64
+		Repackrm     uint64
+		Unref        uint64
+		Uncompressed uint64
+	}
+	Packs struct {
+		Used       uint
+		Unused     uint
+		PartlyUsed uint
+		Unref      uint
+		Keep       uint
+		Repack     uint
+		Remove     uint
+	}
+}
+
+type PrunePlan struct {
+	removePacksFirst restic.IDSet          // packs to remove first (unreferenced packs)
+	repackPacks      restic.IDSet          // packs to repack
+	keepBlobs        restic.CountedBlobSet // blobs to keep during repacking
+	removePacks      restic.IDSet          // packs to remove
+	ignorePacks      restic.IDSet          // packs to ignore when rebuilding the index
+}
+
+type packInfo struct {
+	usedBlobs    uint
+	unusedBlobs  uint
+	usedSize     uint64
+	unusedSize   uint64
+	tpe          restic.BlobType
+	uncompressed bool
+}
+
+type packInfoWithID struct {
+	ID restic.ID
+	packInfo
+	mustCompress bool
+}
+
+// PlanPrune selects which files to rewrite and which to delete and which blobs to keep.
+// Also some summary statistics are returned.
+func PlanPrune(ctx context.Context, opts PruneOptions, repo restic.Repository, getUsedBlobs func(ctx context.Context, repo restic.Repository) (usedBlobs restic.CountedBlobSet, err error), printer progress.Printer) (PrunePlan, PruneStats, error) {
+	var stats PruneStats
+
+	usedBlobs, err := getUsedBlobs(ctx, repo)
+	if err != nil {
+		return PrunePlan{}, stats, err
+	}
+
+	printer.P("searching used packs...\n")
+	keepBlobs, indexPack, err := packInfoFromIndex(ctx, repo.Index(), usedBlobs, &stats, printer)
+	if err != nil {
+		return PrunePlan{}, stats, err
+	}
+
+	printer.P("collecting packs for deletion and repacking\n")
+	plan, err := decidePackAction(ctx, opts, repo, indexPack, &stats, printer)
+	if err != nil {
+		return PrunePlan{}, stats, err
+	}
+
+	if len(plan.repackPacks) != 0 {
+		blobCount := keepBlobs.Len()
+		// when repacking, we do not want to keep blobs which are
+		// already contained in kept packs, so delete them from keepBlobs
+		repo.Index().Each(ctx, func(blob restic.PackedBlob) {
+			if plan.removePacks.Has(blob.PackID) || plan.repackPacks.Has(blob.PackID) {
+				return
+			}
+			keepBlobs.Delete(blob.BlobHandle)
+		})
+
+		if keepBlobs.Len() < blobCount/2 {
+			// replace with copy to shrink map to necessary size if there's a chance to benefit
+			keepBlobs = keepBlobs.Copy()
+		}
+	} else {
+		// keepBlobs is only needed if packs are repacked
+		keepBlobs = nil
+	}
+	plan.keepBlobs = keepBlobs
+
+	return plan, stats, nil
+}
+
+func packInfoFromIndex(ctx context.Context, idx restic.MasterIndex, usedBlobs restic.CountedBlobSet, stats *PruneStats, printer progress.Printer) (restic.CountedBlobSet, map[restic.ID]packInfo, error) {
+	// iterate over all blobs in index to find out which blobs are duplicates
+	// The counter in usedBlobs describes how many instances of the blob exist in the repository index
+	// Thus 0 == blob is missing, 1 == blob exists once, >= 2 == duplicates exist
+	idx.Each(ctx, func(blob restic.PackedBlob) {
+		bh := blob.BlobHandle
+		count, ok := usedBlobs[bh]
+		if ok {
+			if count < math.MaxUint8 {
+				// don't overflow, but saturate count at 255
+				// this can lead to a non-optimal pack selection, but won't cause
+				// problems otherwise
+				count++
+			}
+
+			usedBlobs[bh] = count
+		}
+	})
+
+	// Check if all used blobs have been found in index
+	missingBlobs := restic.NewBlobSet()
+	for bh, count := range usedBlobs {
+		if count == 0 {
+			// blob does not exist in any pack files
+			missingBlobs.Insert(bh)
+		}
+	}
+
+	if len(missingBlobs) != 0 {
+		printer.E("%v not found in the index\n\n"+
+			"Integrity check failed: Data seems to be missing.\n"+
+			"Will not start prune to prevent (additional) data loss!\n"+
+			"Please report this error (along with the output of the 'prune' run) at\n"+
+			"https://github.com/restic/restic/issues/new/choose\n", missingBlobs)
+		return nil, nil, ErrIndexIncomplete
+	}
+
+	indexPack := make(map[restic.ID]packInfo)
+
+	// save computed pack header size
+	for pid, hdrSize := range pack.Size(ctx, idx, true) {
+		// initialize tpe with NumBlobTypes to indicate it's not set
+		indexPack[pid] = packInfo{tpe: restic.NumBlobTypes, usedSize: uint64(hdrSize)}
+	}
+
+	hasDuplicates := false
+	// iterate over all blobs in index to generate packInfo
+	idx.Each(ctx, func(blob restic.PackedBlob) {
+		ip := indexPack[blob.PackID]
+
+		// Set blob type if not yet set
+		if ip.tpe == restic.NumBlobTypes {
+			ip.tpe = blob.Type
+		}
+
+		// mark mixed packs with "Invalid blob type"
+		if ip.tpe != blob.Type {
+			ip.tpe = restic.InvalidBlob
+		}
+
+		bh := blob.BlobHandle
+		size := uint64(blob.Length)
+		dupCount := usedBlobs[bh]
+		switch {
+		case dupCount >= 2:
+			hasDuplicates = true
+			// mark as unused for now, we will later on select one copy
+			ip.unusedSize += size
+			ip.unusedBlobs++
+
+			// count as duplicate, will later on change one copy to be counted as used
+			stats.Size.Duplicate += size
+			stats.Blobs.Duplicate++
+		case dupCount == 1: // used blob, not duplicate
+			ip.usedSize += size
+			ip.usedBlobs++
+
+			stats.Size.Used += size
+			stats.Blobs.Used++
+		default: // unused blob
+			ip.unusedSize += size
+			ip.unusedBlobs++
+
+			stats.Size.Unused += size
+			stats.Blobs.Unused++
+		}
+		if !blob.IsCompressed() {
+			ip.uncompressed = true
+		}
+		// update indexPack
+		indexPack[blob.PackID] = ip
+	})
+
+	// if duplicate blobs exist, those will be set to either "used" or "unused":
+	// - mark only one occurrence of duplicate blobs as used
+	// - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used"
+	// - if there are no used blobs in a pack, possibly mark duplicates as "unused"
+	if hasDuplicates {
+		// iterate again over all blobs in index (this is pretty cheap, all in-mem)
+		idx.Each(ctx, func(blob restic.PackedBlob) {
+			bh := blob.BlobHandle
+			count, ok := usedBlobs[bh]
+			// skip non-duplicate, aka. normal blobs
+			// count == 0 is used to mark that this was a duplicate blob with only a single occurrence remaining
+			if !ok || count == 1 {
+				return
+			}
+
+			ip := indexPack[blob.PackID]
+			size := uint64(blob.Length)
+			switch {
+			case ip.usedBlobs > 0, count == 0:
+				// other used blobs in pack or "last" occurrence ->  transition to used
+				ip.usedSize += size
+				ip.usedBlobs++
+				ip.unusedSize -= size
+				ip.unusedBlobs--
+				// same for the global statistics
+				stats.Size.Used += size
+				stats.Blobs.Used++
+				stats.Size.Duplicate -= size
+				stats.Blobs.Duplicate--
+				// let other occurrences remain marked as unused
+				usedBlobs[bh] = 1
+			default:
+				// remain unused and decrease counter
+				count--
+				if count == 1 {
+					// setting count to 1 would lead to forgetting that this blob had duplicates
+					// thus use the special value zero. This will select the last instance of the blob for keeping.
+					count = 0
+				}
+				usedBlobs[bh] = count
+			}
+			// update indexPack
+			indexPack[blob.PackID] = ip
+		})
+	}
+
+	// Sanity check. If no duplicates exist, all blobs have value 1. After handling
+	// duplicates, this also applies to duplicates.
+	for _, count := range usedBlobs {
+		if count != 1 {
+			panic("internal error during blob selection")
+		}
+	}
+
+	return usedBlobs, indexPack, nil
+}
+
+func decidePackAction(ctx context.Context, opts PruneOptions, repo restic.Repository, indexPack map[restic.ID]packInfo, stats *PruneStats, printer progress.Printer) (PrunePlan, error) {
+	removePacksFirst := restic.NewIDSet()
+	removePacks := restic.NewIDSet()
+	repackPacks := restic.NewIDSet()
+
+	var repackCandidates []packInfoWithID
+	var repackSmallCandidates []packInfoWithID
+	repoVersion := repo.Config().Version
+	// only repack very small files by default
+	targetPackSize := repo.PackSize() / 25
+	if opts.RepackSmall {
+		// consider files with at least 80% of the target size as large enough
+		targetPackSize = repo.PackSize() / 5 * 4
+	}
+
+	// loop over all packs and decide what to do
+	bar := printer.NewCounter("packs processed")
+	bar.SetMax(uint64(len(indexPack)))
+	err := repo.List(ctx, restic.PackFile, func(id restic.ID, packSize int64) error {
+		p, ok := indexPack[id]
+		if !ok {
+			// Pack was not referenced in index and is not used  => immediately remove!
+			printer.V("will remove pack %v as it is unused and not indexed\n", id.Str())
+			removePacksFirst.Insert(id)
+			stats.Size.Unref += uint64(packSize)
+			return nil
+		}
+
+		if p.unusedSize+p.usedSize != uint64(packSize) && p.usedBlobs != 0 {
+			// Pack size does not fit and pack is needed => error
+			// If the pack is not needed, this is no error, the pack can
+			// and will be simply removed, see below.
+			printer.E("pack %s: calculated size %d does not match real size %d\nRun 'restic repair index'.\n",
+				id.Str(), p.unusedSize+p.usedSize, packSize)
+			return ErrSizeNotMatching
+		}
+
+		// statistics
+		switch {
+		case p.usedBlobs == 0:
+			stats.Packs.Unused++
+		case p.unusedBlobs == 0:
+			stats.Packs.Used++
+		default:
+			stats.Packs.PartlyUsed++
+		}
+
+		if p.uncompressed {
+			stats.Size.Uncompressed += p.unusedSize + p.usedSize
+		}
+		mustCompress := false
+		if repoVersion >= 2 {
+			// repo v2: always repack tree blobs if uncompressed
+			// compress data blobs if requested
+			mustCompress = (p.tpe == restic.TreeBlob || opts.RepackUncompressed) && p.uncompressed
+		}
+
+		// decide what to do
+		switch {
+		case p.usedBlobs == 0:
+			// All blobs in pack are no longer used => remove pack!
+			removePacks.Insert(id)
+			stats.Blobs.Remove += p.unusedBlobs
+			stats.Size.Remove += p.unusedSize
+
+		case opts.RepackCachableOnly && p.tpe == restic.DataBlob:
+			// if this is a data pack and --repack-cacheable-only is set => keep pack!
+			stats.Packs.Keep++
+
+		case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
+			if packSize >= int64(targetPackSize) {
+				// All blobs in pack are used and not mixed => keep pack!
+				stats.Packs.Keep++
+			} else {
+				repackSmallCandidates = append(repackSmallCandidates, packInfoWithID{ID: id, packInfo: p, mustCompress: mustCompress})
+			}
+
+		default:
+			// all other packs are candidates for repacking
+			repackCandidates = append(repackCandidates, packInfoWithID{ID: id, packInfo: p, mustCompress: mustCompress})
+		}
+
+		delete(indexPack, id)
+		bar.Add(1)
+		return nil
+	})
+	bar.Done()
+	if err != nil {
+		return PrunePlan{}, err
+	}
+
+	// At this point indexPacks contains only missing packs!
+
+	// missing packs that are not needed can be ignored
+	ignorePacks := restic.NewIDSet()
+	for id, p := range indexPack {
+		if p.usedBlobs == 0 {
+			ignorePacks.Insert(id)
+			stats.Blobs.Remove += p.unusedBlobs
+			stats.Size.Remove += p.unusedSize
+			delete(indexPack, id)
+		}
+	}
+
+	if len(indexPack) != 0 {
+		printer.E("The index references %d needed pack files which are missing from the repository:\n", len(indexPack))
+		for id := range indexPack {
+			printer.E("  %v\n", id)
+		}
+		return PrunePlan{}, ErrPacksMissing
+	}
+	if len(ignorePacks) != 0 {
+		printer.E("Missing but unneeded pack files are referenced in the index, will be repaired\n")
+		for id := range ignorePacks {
+			printer.E("will forget missing pack file %v\n", id)
+		}
+	}
+
+	if len(repackSmallCandidates) < 10 {
+		// too few small files to be worth the trouble, this also prevents endlessly repacking
+		// if there is just a single pack file below the target size
+		stats.Packs.Keep += uint(len(repackSmallCandidates))
+	} else {
+		repackCandidates = append(repackCandidates, repackSmallCandidates...)
+	}
+
+	// Sort repackCandidates such that packs with highest ratio unused/used space are picked first.
+	// This is equivalent to sorting by unused / total space.
+	// Instead of unused[i] / used[i] > unused[j] / used[j] we use
+	// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
+	// Moreover packs containing trees and too small packs are sorted to the beginning
+	sort.Slice(repackCandidates, func(i, j int) bool {
+		pi := repackCandidates[i].packInfo
+		pj := repackCandidates[j].packInfo
+		switch {
+		case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob:
+			return true
+		case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob:
+			return false
+		case pi.unusedSize+pi.usedSize < uint64(targetPackSize) && pj.unusedSize+pj.usedSize >= uint64(targetPackSize):
+			return true
+		case pj.unusedSize+pj.usedSize < uint64(targetPackSize) && pi.unusedSize+pi.usedSize >= uint64(targetPackSize):
+			return false
+		}
+		return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize
+	})
+
+	repack := func(id restic.ID, p packInfo) {
+		repackPacks.Insert(id)
+		stats.Blobs.Repack += p.unusedBlobs + p.usedBlobs
+		stats.Size.Repack += p.unusedSize + p.usedSize
+		stats.Blobs.Repackrm += p.unusedBlobs
+		stats.Size.Repackrm += p.unusedSize
+		if p.uncompressed {
+			stats.Size.Uncompressed -= p.unusedSize + p.usedSize
+		}
+	}
+
+	// calculate limit for number of unused bytes in the repo after repacking
+	maxUnusedSizeAfter := opts.MaxUnusedBytes(stats.Size.Used)
+
+	for _, p := range repackCandidates {
+		reachedUnusedSizeAfter := (stats.Size.Unused-stats.Size.Remove-stats.Size.Repackrm < maxUnusedSizeAfter)
+		reachedRepackSize := stats.Size.Repack+p.unusedSize+p.usedSize >= opts.MaxRepackBytes
+		packIsLargeEnough := p.unusedSize+p.usedSize >= uint64(targetPackSize)
+
+		switch {
+		case reachedRepackSize:
+			stats.Packs.Keep++
+
+		case p.tpe != restic.DataBlob, p.mustCompress:
+			// repacking non-data packs / uncompressed-trees is only limited by repackSize
+			repack(p.ID, p.packInfo)
+
+		case reachedUnusedSizeAfter && packIsLargeEnough:
+			// for all other packs stop repacking if tolerated unused size is reached.
+			stats.Packs.Keep++
+
+		default:
+			repack(p.ID, p.packInfo)
+		}
+	}
+
+	stats.Packs.Unref = uint(len(removePacksFirst))
+	stats.Packs.Repack = uint(len(repackPacks))
+	stats.Packs.Remove = uint(len(removePacks))
+
+	if repo.Config().Version < 2 {
+		// compression not supported for repository format version 1
+		stats.Size.Uncompressed = 0
+	}
+
+	return PrunePlan{removePacksFirst: removePacksFirst,
+		removePacks: removePacks,
+		repackPacks: repackPacks,
+		ignorePacks: ignorePacks,
+	}, nil
+}
+
+// DoPrune does the actual pruning:
+// - remove unreferenced packs first
+// - repack given pack files while keeping the given blobs
+// - rebuild the index while ignoring all files that will be deleted
+// - delete the files
+// plan.removePacks and plan.ignorePacks are modified in this function.
+func DoPrune(ctx context.Context, opts PruneOptions, repo restic.Repository, plan PrunePlan, printer progress.Printer) (err error) {
+	if opts.DryRun {
+		printer.V("Repeated prune dry-runs can report slightly different amounts of data to keep or repack. This is expected behavior.\n\n")
+		if len(plan.removePacksFirst) > 0 {
+			printer.V("Would have removed the following unreferenced packs:\n%v\n\n", plan.removePacksFirst)
+		}
+		printer.V("Would have repacked and removed the following packs:\n%v\n\n", plan.repackPacks)
+		printer.V("Would have removed the following no longer used packs:\n%v\n\n", plan.removePacks)
+		// Always quit here if DryRun was set!
+		return nil
+	}
+
+	// unreferenced packs can be safely deleted first
+	if len(plan.removePacksFirst) != 0 {
+		printer.P("deleting unreferenced packs\n")
+		_ = deleteFiles(ctx, true, repo, plan.removePacksFirst, restic.PackFile, printer)
+	}
+
+	if len(plan.repackPacks) != 0 {
+		printer.P("repacking packs\n")
+		bar := printer.NewCounter("packs repacked")
+		bar.SetMax(uint64(len(plan.repackPacks)))
+		_, err := Repack(ctx, repo, repo, plan.repackPacks, plan.keepBlobs, bar)
+		bar.Done()
+		if err != nil {
+			return errors.Fatal(err.Error())
+		}
+
+		// Also remove repacked packs
+		plan.removePacks.Merge(plan.repackPacks)
+
+		if len(plan.keepBlobs) != 0 {
+			printer.E("%v was not repacked\n\n"+
+				"Integrity check failed.\n"+
+				"Please report this error (along with the output of the 'prune' run) at\n"+
+				"https://github.com/restic/restic/issues/new/choose\n", plan.keepBlobs)
+			return errors.Fatal("internal error: blobs were not repacked")
+		}
+
+		// allow GC of the blob set
+		plan.keepBlobs = nil
+	}
+
+	if len(plan.ignorePacks) == 0 {
+		plan.ignorePacks = plan.removePacks
+	} else {
+		plan.ignorePacks.Merge(plan.removePacks)
+	}
+
+	if opts.UnsafeRecovery {
+		printer.P("deleting index files\n")
+		indexFiles := repo.Index().(*index.MasterIndex).IDs()
+		err = deleteFiles(ctx, false, repo, indexFiles, restic.IndexFile, printer)
+		if err != nil {
+			return errors.Fatalf("%s", err)
+		}
+	} else if len(plan.ignorePacks) != 0 {
+		err = rebuildIndexFiles(ctx, repo, plan.ignorePacks, nil, false, printer)
+		if err != nil {
+			return errors.Fatalf("%s", err)
+		}
+	}
+
+	if len(plan.removePacks) != 0 {
+		printer.P("removing %d old packs\n", len(plan.removePacks))
+		_ = deleteFiles(ctx, true, repo, plan.removePacks, restic.PackFile, printer)
+	}
+
+	if opts.UnsafeRecovery {
+		err = rebuildIndexFiles(ctx, repo, plan.ignorePacks, nil, true, printer)
+		if err != nil {
+			return errors.Fatalf("%s", err)
+		}
+	}
+
+	printer.P("done\n")
+	return nil
+}
+
+// deleteFiles deletes the given fileList of fileType in parallel
+// if ignoreError=true, it will print a warning if there was an error, else it will abort.
+func deleteFiles(ctx context.Context, ignoreError bool, repo restic.Repository, fileList restic.IDSet, fileType restic.FileType, printer progress.Printer) error {
+	bar := printer.NewCounter("files deleted")
+	defer bar.Done()
+
+	return restic.ParallelRemove(ctx, repo, fileList, fileType, func(id restic.ID, err error) error {
+		if err != nil {
+			printer.E("unable to remove %v/%v from the repository\n", fileType, id)
+			if !ignoreError {
+				return err
+			}
+		}
+		printer.VV("removed %v/%v\n", fileType, id)
+		return nil
+	}, bar)
+}