Merge pull request #2718 from aawsome/new-cleanup-command

Reimplementation of prune
Alexander Neumann 2020-11-05 10:12:19 +01:00 committed by GitHub
commit 5144141321
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 607 additions and 216 deletions


@@ -0,0 +1,22 @@
Enhancement: Improve pruning performance and make pruning more customizable
The `prune` command is now much faster. This is especially the case for remote
repositories or repositories that contain little data to remove.
The memory usage of the `prune` command has also been reduced.
By default, the `prune` command no longer removes all unused data. This
behavior can be fine-tuned using new options, such as the acceptable amount of unused space or
the maximum size of data to reorganize. For more details, see
https://restic.readthedocs.io/en/stable/060_forget.html
Moreover, `prune` now accepts the `--dry-run` option, and `forget --dry-run --prune`
also shows what `prune` would do.
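For example, a dry run that tolerates up to 10% of unused data in the
repository (a small sketch using the new options):

    restic prune --max-unused 10% --dry-run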
Fixes several open issues, e.g.:
https://github.com/restic/restic/issues/1140
https://github.com/restic/restic/issues/1985
https://github.com/restic/restic/issues/2112
https://github.com/restic/restic/issues/2227
https://github.com/restic/restic/issues/2305
https://github.com/restic/restic/pull/2718


@@ -80,9 +80,15 @@ func init() {
f.BoolVar(&forgetOptions.Prune, "prune", false, "automatically run the 'prune' command if snapshots have been removed")
f.SortFlags = false
addPruneOptions(cmdForget)
}
func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error {
err := verifyPruneOptions(&pruneOptions)
if err != nil {
return err
}
repo, err := OpenRepository(gopts)
if err != nil {
return err
@@ -205,7 +211,11 @@ func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error {
}
if len(removeSnIDs) > 0 && opts.Prune && !opts.DryRun {
return pruneRepository(gopts, repo)
if !gopts.JSON {
Verbosef("%d snapshots have been removed, running prune\n", len(removeSnIDs))
}
pruneOptions.DryRun = opts.DryRun
return runPruneWithRepo(pruneOptions, gopts, repo, removeSnIDs)
}
return nil


@@ -1,15 +1,24 @@
package main
import (
"math"
"sort"
"strconv"
"strings"
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/index"
"github.com/restic/restic/internal/pack"
"github.com/restic/restic/internal/repository"
"github.com/restic/restic/internal/restic"
"github.com/spf13/cobra"
)
var errorIndexIncomplete = errors.Fatal("index is not complete")
var errorPacksMissing = errors.Fatal("packs from index missing in repo")
var errorSizeNotMatching = errors.Fatal("pack size does not match calculated size from index")
var cmdPrune = &cobra.Command{
Use: "prune [flags]",
Short: "Remove unneeded data from the repository",
@@ -24,12 +33,91 @@ Exit status is 0 if the command was successful, and non-zero if there was any error.
`,
DisableAutoGenTag: true,
RunE: func(cmd *cobra.Command, args []string) error {
return runPrune(globalOptions)
return runPrune(pruneOptions, globalOptions)
},
}
// PruneOptions collects all options for the prune command.
type PruneOptions struct {
DryRun bool
MaxUnused string
maxUnusedBytes func(used uint64) (unused uint64) // calculates the number of unused bytes after repacking, according to MaxUnused
MaxRepackSize string
MaxRepackBytes uint64
RepackCachableOnly bool
}
var pruneOptions PruneOptions
func init() {
cmdRoot.AddCommand(cmdPrune)
f := cmdPrune.Flags()
f.BoolVarP(&pruneOptions.DryRun, "dry-run", "n", false, "do not modify the repository, just print what would be done")
addPruneOptions(cmdPrune)
}
func addPruneOptions(c *cobra.Command) {
f := c.Flags()
f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused data (absolute value in bytes with suffixes k/K, m/M, g/G, t/T, a value in % or the word 'unlimited')")
f.StringVar(&pruneOptions.MaxRepackSize, "max-repack-size", "", "maximum `size` to repack (allowed suffixes: k/K, m/M, g/G, t/T)")
f.BoolVar(&pruneOptions.RepackCachableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable")
}
func verifyPruneOptions(opts *PruneOptions) error {
if len(opts.MaxRepackSize) > 0 {
size, err := parseSizeStr(opts.MaxRepackSize)
if err != nil {
return err
}
opts.MaxRepackBytes = uint64(size)
}
maxUnused := strings.TrimSpace(opts.MaxUnused)
if maxUnused == "" {
return errors.Fatalf("invalid value for --max-unused: %q", opts.MaxUnused)
}
// parse MaxUnused either as unlimited, a percentage, or an absolute number of bytes
switch {
case maxUnused == "unlimited":
opts.maxUnusedBytes = func(used uint64) uint64 {
return math.MaxUint64
}
case strings.HasSuffix(maxUnused, "%"):
maxUnused = strings.TrimSuffix(maxUnused, "%")
p, err := strconv.ParseFloat(maxUnused, 64)
if err != nil {
return errors.Fatalf("invalid percentage %q passed for --max-unused: %v", opts.MaxUnused, err)
}
if p < 0 {
return errors.Fatal("percentage for --max-unused must be positive")
}
if p >= 100 {
return errors.Fatal("percentage for --max-unused must be below 100%")
}
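// Allow at most p percent of the total repository size (used + unused) to be
// unused: solving unused <= p/100 * (used + unused) for unused gives
// unused <= p/(100-p) * used.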
opts.maxUnusedBytes = func(used uint64) uint64 {
return uint64(p / (100 - p) * float64(used))
}
default:
size, err := parseSizeStr(maxUnused)
if err != nil {
return errors.Fatalf("invalid number of bytes %q for --max-unused: %v", opts.MaxUnused, err)
}
opts.maxUnusedBytes = func(used uint64) uint64 {
return uint64(size)
}
}
return nil
}
func shortenStatus(maxLength int, s string) string {
@@ -44,7 +132,12 @@ func shortenStatus(maxLength int, s string) string {
return s[:maxLength-3] + "..."
}
func runPrune(gopts GlobalOptions) error {
func runPrune(opts PruneOptions, gopts GlobalOptions) error {
err := verifyPruneOptions(&opts)
if err != nil {
return err
}
repo, err := OpenRepository(gopts)
if err != nil {
return err
@@ -56,203 +149,345 @@ func runPrune(gopts GlobalOptions) error {
return err
}
return runPruneWithRepo(opts, gopts, repo, restic.NewIDSet())
}
func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.Repository, ignoreSnapshots restic.IDSet) error {
// we do not need index updates while pruning!
repo.DisableAutoIndexUpdate()
return pruneRepository(gopts, repo)
}
func mixedBlobs(list []restic.Blob) bool {
var tree, data bool
for _, pb := range list {
switch pb.Type {
case restic.TreeBlob:
tree = true
case restic.DataBlob:
data = true
}
if tree && data {
return true
}
}
return false
}
func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
ctx := gopts.ctx
err := repo.LoadIndex(ctx)
Verbosef("loading all snapshots...\n")
snapshots, err := restic.LoadAllSnapshots(gopts.ctx, repo, ignoreSnapshots)
if err != nil {
return err
}
var stats struct {
blobs int
packs int
snapshots int
bytes int64
}
Verbosef("counting files in repo\n")
err = repo.List(ctx, restic.PackFile, func(restic.ID, int64) error {
stats.packs++
return nil
})
Verbosef("loading indexes...\n")
err = repo.LoadIndex(gopts.ctx)
if err != nil {
return err
}
Verbosef("building new index for repo\n")
bar := newProgressMax(!gopts.Quiet, uint64(stats.packs), "packs")
idx, invalidFiles, err := index.New(ctx, repo, restic.NewIDSet(), bar)
if err != nil {
return err
}
for _, id := range invalidFiles {
Warnf("incomplete pack file (will be removed): %v\n", id)
}
blobs := 0
for _, pack := range idx.Packs {
stats.bytes += pack.Size
blobs += len(pack.Entries)
}
Verbosef("repository contains %v packs (%v blobs) with %v\n",
len(idx.Packs), blobs, formatBytes(uint64(stats.bytes)))
blobCount := make(map[restic.BlobHandle]int)
var duplicateBlobs uint64
var duplicateBytes uint64
// find duplicate blobs
for _, p := range idx.Packs {
for _, entry := range p.Entries {
stats.blobs++
h := restic.BlobHandle{ID: entry.ID, Type: entry.Type}
blobCount[h]++
if blobCount[h] > 1 {
duplicateBlobs++
duplicateBytes += uint64(entry.Length)
}
}
}
Verbosef("processed %d blobs: %d duplicate blobs, %v duplicate\n",
stats.blobs, duplicateBlobs, formatBytes(uint64(duplicateBytes)))
Verbosef("load all snapshots\n")
// find referenced blobs
snapshots, err := restic.LoadAllSnapshots(ctx, repo)
if err != nil {
return err
}
stats.snapshots = len(snapshots)
usedBlobs, err := getUsedBlobs(gopts, repo, snapshots)
if err != nil {
return err
}
var missingBlobs []restic.BlobHandle
for h := range usedBlobs {
if _, ok := blobCount[h]; !ok {
missingBlobs = append(missingBlobs, h)
return prune(opts, gopts, repo, usedBlobs)
}
type packInfo struct {
usedBlobs uint
unusedBlobs uint
duplicateBlobs uint
usedSize uint64
unusedSize uint64
tpe restic.BlobType
}
type packInfoWithID struct {
ID restic.ID
packInfo
}
// prune selects which files to rewrite and then does that. The map usedBlobs is
// modified in the process.
func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedBlobs restic.BlobSet) error {
ctx := gopts.ctx
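// Statistics are tracked per blob and per byte: "remove" covers packs that
// are deleted entirely, "repack" and "repackrm" cover the packs to rewrite
// and the unused data the rewrite drops, and "unref" covers pack files that
// are not referenced in the index at all.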
var stats struct {
blobs struct {
used uint
duplicate uint
unused uint
remove uint
repack uint
repackrm uint
}
size struct {
used uint64
duplicate uint64
unused uint64
remove uint64
repack uint64
repackrm uint64
unref uint64
}
packs struct {
used uint
unused uint
partlyUsed uint
keep uint
}
}
if len(missingBlobs) > 0 {
return errors.Fatalf("%v not found in the new index\n"+
Verbosef("searching used packs...\n")
keepBlobs := restic.NewBlobSet()
duplicateBlobs := restic.NewBlobSet()
// iterate over all blobs in index to find out which blobs are duplicates
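// The first copy of a used blob found in the index is moved to keepBlobs;
// every additional copy of the same blob is recorded as a duplicate.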
for blob := range repo.Index().Each(ctx) {
bh := blob.Handle()
switch {
case usedBlobs.Has(bh): // used blob, move to keepBlobs
usedBlobs.Delete(bh)
keepBlobs.Insert(bh)
case keepBlobs.Has(bh): // duplicate blob
duplicateBlobs.Insert(bh)
}
}
// Check if all used blobs have been found in index
if len(usedBlobs) != 0 {
Warnf("%v not found in the new index\n"+
"Data blobs seem to be missing, aborting prune to prevent further data loss!\n"+
"Please report this error (along with the output of the 'prune' run) at\n"+
"https://github.com/restic/restic/issues/new/choose", missingBlobs)
"https://github.com/restic/restic/issues/new/choose", usedBlobs)
return errorIndexIncomplete
}
Verbosef("found %d of %d data blobs still in use, removing %d blobs\n",
len(usedBlobs), stats.blobs, stats.blobs-len(usedBlobs))
indexPack := make(map[restic.ID]packInfo)
// find packs that need a rewrite
rewritePacks := restic.NewIDSet()
for _, pack := range idx.Packs {
if mixedBlobs(pack.Entries) {
rewritePacks.Insert(pack.ID)
continue
// iterate over all blobs in index to generate packInfo
for blob := range repo.Index().Each(ctx) {
ip, ok := indexPack[blob.PackID]
if !ok {
ip = packInfo{tpe: blob.Type, usedSize: pack.HeaderSize}
}
// mark mixed packs with "Invalid blob type"
if ip.tpe != blob.Type {
ip.tpe = restic.InvalidBlob
}
for _, blob := range pack.Entries {
h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
if !usedBlobs.Has(h) {
rewritePacks.Insert(pack.ID)
continue
}
if blobCount[h] > 1 {
rewritePacks.Insert(pack.ID)
}
bh := blob.Handle()
size := uint64(pack.PackedSizeOfBlob(blob.Length))
switch {
case duplicateBlobs.Has(bh): // duplicate blob
ip.usedSize += size
ip.duplicateBlobs++
stats.size.duplicate += size
stats.blobs.duplicate++
case keepBlobs.Has(bh): // used blob, not duplicate
ip.usedSize += size
ip.usedBlobs++
stats.size.used += size
stats.blobs.used++
default: // unused blob
ip.unusedSize += size
ip.unusedBlobs++
stats.size.unused += size
stats.blobs.unused++
}
// update indexPack
indexPack[blob.PackID] = ip
}
removeBytes := duplicateBytes
// find packs that are unneeded
Verbosef("collecting packs for deletion and repacking\n")
removePacksFirst := restic.NewIDSet()
removePacks := restic.NewIDSet()
repackPacks := restic.NewIDSet()
Verbosef("will remove %d invalid files\n", len(invalidFiles))
for _, id := range invalidFiles {
removePacks.Insert(id)
}
var repackCandidates []packInfoWithID
for packID, p := range idx.Packs {
hasActiveBlob := false
for _, blob := range p.Entries {
h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
if usedBlobs.Has(h) {
hasActiveBlob = true
continue
}
removeBytes += uint64(blob.Length)
// loop over all packs and decide what to do
bar := newProgressMax(!gopts.Quiet, uint64(len(indexPack)), "packs processed")
bar.Start()
err := repo.List(ctx, restic.PackFile, func(id restic.ID, packSize int64) error {
p, ok := indexPack[id]
if !ok {
// Pack was not referenced in index and is not used => immediately remove!
Verboseff("will remove pack %v as it is unused and not indexed\n", id.Str())
removePacksFirst.Insert(id)
stats.size.unref += uint64(packSize)
return nil
}
if hasActiveBlob {
continue
if p.unusedSize+p.usedSize != uint64(packSize) {
Warnf("pack %s: calculated size %d does not match real size %d\nRun 'restic rebuild-index'.",
id.Str(), p.unusedSize+p.usedSize, packSize)
return errorSizeNotMatching
}
removePacks.Insert(packID)
if !rewritePacks.Has(packID) {
return errors.Fatalf("pack %v is unneeded, but not contained in rewritePacks", packID.Str())
// statistics
switch {
case p.usedBlobs == 0 && p.duplicateBlobs == 0:
stats.packs.unused++
case p.unusedBlobs == 0:
stats.packs.used++
default:
stats.packs.partlyUsed++
}
rewritePacks.Delete(packID)
}
// decide what to do
switch {
case p.usedBlobs == 0 && p.duplicateBlobs == 0:
// All blobs in pack are no longer used => remove pack!
removePacks.Insert(id)
stats.blobs.remove += p.unusedBlobs
stats.size.remove += p.unusedSize
Verbosef("will delete %d packs and rewrite %d packs, this frees %s\n",
len(removePacks), len(rewritePacks), formatBytes(uint64(removeBytes)))
case opts.RepackCachableOnly && p.tpe == restic.DataBlob:
// if this is a data pack and --repack-cacheable-only is set => keep pack!
stats.packs.keep++
var obsoletePacks restic.IDSet
if len(rewritePacks) != 0 {
bar := newProgressMax(!gopts.Quiet, uint64(len(rewritePacks)), "packs rewritten")
obsoletePacks, err = repository.Repack(ctx, repo, rewritePacks, usedBlobs, bar)
if err != nil {
return err
case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob:
// All blobs in pack are used and not duplicates/mixed => keep pack!
stats.packs.keep++
default:
// all other packs are candidates for repacking
repackCandidates = append(repackCandidates, packInfoWithID{ID: id, packInfo: p})
}
}
removePacks.Merge(obsoletePacks)
if err = rebuildIndex(ctx, repo, removePacks); err != nil {
delete(indexPack, id)
bar.Report(restic.Stat{Blobs: 1})
return nil
})
bar.Done()
if err != nil {
return err
}
if len(indexPack) != 0 {
Warnf("The index references pack files which are missing from the repository: %v\n", indexPack)
return errorPacksMissing
}
repackAllPacksWithDuplicates := true
// calculate limit for number of unused bytes in the repo after repacking
maxUnusedSizeAfter := opts.maxUnusedBytes(stats.size.used)
// Sort repackCandidates such that packs with highest ratio unused/used space are picked first.
// This is equivalent to sorting by unused / total space.
// Instead of unused[i] / used[i] > unused[j] / used[j] we compare
// unused[i] * used[j] > unused[j] * used[i], as the product of two
// uint32-sized values always fits into a uint64 and hence cannot overflow.
// Moreover, packs with duplicate blobs and mixed packs are sorted to the beginning.
sort.Slice(repackCandidates, func(i, j int) bool {
pi := repackCandidates[i].packInfo
pj := repackCandidates[j].packInfo
switch {
case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0:
return true
case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0:
return false
case pi.tpe == restic.InvalidBlob && pj.tpe != restic.InvalidBlob:
return true
case pj.tpe == restic.InvalidBlob && pi.tpe != restic.InvalidBlob:
return false
}
return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize
})
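// repack marks a pack for repacking: all blobs it contains are counted as
// rewritten, and its unused blobs are counted as removed by the rewrite.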
repack := func(id restic.ID, p packInfo) {
repackPacks.Insert(id)
stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs
stats.size.repack += p.unusedSize + p.usedSize
stats.blobs.repackrm += p.unusedBlobs
stats.size.repackrm += p.unusedSize
}
for _, p := range repackCandidates {
reachedUnusedSizeAfter := (stats.size.unused-stats.size.remove-stats.size.repackrm < maxUnusedSizeAfter)
reachedRepackSize := false
if opts.MaxRepackBytes > 0 {
reachedRepackSize = stats.size.repack+p.unusedSize+p.usedSize > opts.MaxRepackBytes
}
switch {
case !reachedRepackSize && (p.duplicateBlobs > 0 || p.tpe == restic.InvalidBlob):
// repacking duplicates/mixed is only limited by repackSize
repack(p.ID, p.packInfo)
case reachedUnusedSizeAfter, reachedRepackSize:
// for all other packs stop repacking if tolerated unused size is reached.
stats.packs.keep++
if p.duplicateBlobs > 0 {
repackAllPacksWithDuplicates = false
}
default:
repack(p.ID, p.packInfo)
}
}
// if all duplicates are repacked, print out correct statistics
if repackAllPacksWithDuplicates {
stats.blobs.repackrm += stats.blobs.duplicate
stats.size.repackrm += stats.size.duplicate
}
Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used))
if stats.blobs.duplicate > 0 {
Verboseff("duplicates: %10d blobs / %s\n", stats.blobs.duplicate, formatBytes(stats.size.duplicate))
}
Verboseff("unused: %10d blobs / %s\n", stats.blobs.unused, formatBytes(stats.size.unused))
if stats.size.unref > 0 {
Verboseff("unreferenced: %s\n", formatBytes(stats.size.unref))
}
totalBlobs := stats.blobs.used + stats.blobs.unused + stats.blobs.duplicate
totalSize := stats.size.used + stats.size.duplicate + stats.size.unused + stats.size.unref
Verboseff("total: %10d blobs / %s\n", totalBlobs, formatBytes(totalSize))
Verboseff("unused size: %s of total size\n", formatPercent(stats.size.unused, totalSize))
Verbosef("\nto repack: %10d blobs / %s\n", stats.blobs.repack, formatBytes(stats.size.repack))
Verbosef("this removes %10d blobs / %s\n", stats.blobs.repackrm, formatBytes(stats.size.repackrm))
Verbosef("to delete: %10d blobs / %s\n", stats.blobs.remove, formatBytes(stats.size.remove+stats.size.unref))
totalPruneSize := stats.size.remove + stats.size.repackrm + stats.size.unref
Verbosef("total prune: %10d blobs / %s\n", stats.blobs.remove+stats.blobs.repackrm, formatBytes(totalPruneSize))
Verbosef("remaining: %10d blobs / %s\n", totalBlobs-(stats.blobs.remove+stats.blobs.repackrm), formatBytes(totalSize-totalPruneSize))
unusedAfter := stats.size.unused - stats.size.remove - stats.size.repackrm
Verbosef("unused size after prune: %s (%s of remaining size)\n",
formatBytes(unusedAfter), formatPercent(unusedAfter, totalSize-totalPruneSize))
Verbosef("\n")
Verboseff("totally used packs: %10d\n", stats.packs.used)
Verboseff("partly used packs: %10d\n", stats.packs.partlyUsed)
Verboseff("unused packs: %10d\n\n", stats.packs.unused)
Verboseff("to keep: %10d packs\n", stats.packs.keep)
Verboseff("to repack: %10d packs\n", len(repackPacks))
Verboseff("to delete: %10d packs\n", len(removePacks))
if len(removePacksFirst) > 0 {
Verboseff("to delete: %10d unreferenced packs\n\n", len(removePacksFirst))
}
if opts.DryRun {
if !gopts.JSON && gopts.verbosity >= 2 {
if len(removePacksFirst) > 0 {
Printf("Would have removed the following unreferenced packs:\n%v\n\n", removePacksFirst)
}
Printf("Would have repacked and removed the following packs:\n%v\n\n", repackPacks)
Printf("Would have removed the following no longer used packs:\n%v\n\n", removePacks)
}
// Always quit here if DryRun was set!
return nil
}
// unreferenced packs can be safely deleted first
if len(removePacksFirst) != 0 {
Verbosef("deleting unreferenced packs\n")
DeleteFiles(gopts, repo, removePacksFirst, restic.PackFile)
}
if len(repackPacks) != 0 {
Verbosef("repacking packs\n")
bar := newProgressMax(!gopts.Quiet, uint64(len(repackPacks)), "packs repacked")
_, err := repository.Repack(ctx, repo, repackPacks, keepBlobs, bar)
if err != nil {
return err
}
// Also remove repacked packs
removePacks.Merge(repackPacks)
}
if len(removePacks) != 0 {
Verbosef("remove %d old packs\n", len(removePacks))
if err = rebuildIndex(ctx, repo, removePacks); err != nil {
return err
}
Verbosef("removing %d old packs\n", len(removePacks))
DeleteFiles(gopts, repo, removePacks, restic.PackFile)
}
@@ -263,7 +498,7 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
func getUsedBlobs(gopts GlobalOptions, repo restic.Repository, snapshots []*restic.Snapshot) (usedBlobs restic.BlobSet, err error) {
ctx := gopts.ctx
Verbosef("find data that is still in use for %d snapshots\n", len(snapshots))
Verbosef("finding data that is still in use for %d snapshots\n", len(snapshots))
usedBlobs = restic.NewBlobSet()


@@ -231,6 +231,13 @@ func Verbosef(format string, args ...interface{}) {
}
}
// Verboseff calls Printf to write the message when the verbosity is >= 2
func Verboseff(format string, args ...interface{}) {
if globalOptions.verbosity >= 2 {
Printf(format, args...)
}
}
// PrintProgress wraps fmt.Printf to handle the difference in writing progress
// information to terminals and non-terminal stdout
func PrintProgress(format string, args ...interface{}) {


@@ -270,8 +270,8 @@ func testRunForgetJSON(t testing.TB, gopts GlobalOptions, args ...string) {
"Expected 2 snapshots to be removed, got %v", len(forgets[0].Remove))
}
func testRunPrune(t testing.TB, gopts GlobalOptions) {
rtest.OK(t, runPrune(gopts))
func testRunPrune(t testing.TB, gopts GlobalOptions, opts PruneOptions) {
rtest.OK(t, runPrune(opts, gopts))
}
func testSetupBackupData(t testing.TB, env *testEnvironment) string {
@@ -1386,6 +1386,32 @@ func TestCheckRestoreNoLock(t *testing.T) {
}
func TestPrune(t *testing.T) {
t.Run("0", func(t *testing.T) {
opts := PruneOptions{MaxUnused: "0%"}
checkOpts := CheckOptions{ReadData: true, CheckUnused: true}
testPrune(t, opts, checkOpts)
})
t.Run("50", func(t *testing.T) {
opts := PruneOptions{MaxUnused: "50%"}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
t.Run("unlimited", func(t *testing.T) {
opts := PruneOptions{MaxUnused: "unlimited"}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
t.Run("CachableOnly", func(t *testing.T) {
opts := PruneOptions{MaxUnused: "5%", RepackCachableOnly: true}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
}
func testPrune(t *testing.T, pruneOpts PruneOptions, checkOpts CheckOptions) {
env, cleanup := withTestEnvironment(t)
defer cleanup()
@@ -1406,10 +1432,12 @@ func TestPrune(t *testing.T) {
testRunForgetJSON(t, env.gopts)
testRunForget(t, env.gopts, firstSnapshot[0].String())
testRunPrune(t, env.gopts)
testRunCheck(t, env.gopts)
testRunPrune(t, env.gopts, pruneOpts)
rtest.OK(t, runCheck(checkOpts, env.gopts, nil))
}
var pruneDefaultOptions = PruneOptions{MaxUnused: "5%"}
func listPacks(gopts GlobalOptions, t *testing.T) restic.IDSet {
r, err := OpenRepository(gopts)
rtest.OK(t, err)
@@ -1452,14 +1480,8 @@ func TestPruneWithDamagedRepository(t *testing.T) {
"expected one snapshot, got %v", snapshotIDs)
// prune should fail
err := runPrune(env.gopts)
if err == nil {
t.Fatalf("expected prune to fail")
}
if !strings.Contains(err.Error(), "blobs seem to be missing") {
t.Fatalf("did not find hint for missing blobs")
}
t.Log(err)
rtest.Assert(t, runPrune(pruneDefaultOptions, env.gopts) == errorPacksMissing,
"prune should have reported index not complete error")
}
// Test repos for edge cases
@@ -1469,37 +1491,37 @@ func TestEdgeCaseRepos(t *testing.T) {
// repo where index is completely missing
// => check and prune should fail
t.Run("no-index", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, false, false)
testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, pruneDefaultOptions, false, false)
})
// repo where an existing and used blob is missing from the index
// => check should fail, prune should heal this
// => check and prune should fail
t.Run("index-missing-blob", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, false, true)
testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, pruneDefaultOptions, false, false)
})
// repo where a blob is missing
// => check and prune should fail
t.Run("no-data", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, false, false)
testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, pruneDefaultOptions, false, false)
})
// repo where data exists that is not referenced
// => check and prune should fully work
t.Run("unreferenced-data", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, true, true)
testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, pruneDefaultOptions, true, true)
})
// repo where an obsolete index still exists
// => check and prune should fully work
t.Run("obsolete-index", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, true, true)
testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, pruneDefaultOptions, true, true)
})
// repo which contains mixed (data/tree) packs
// => check and prune should fully work
t.Run("mixed-packs", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, true, true)
testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, pruneDefaultOptions, true, true)
})
// repo which contains duplicate blobs
@@ -1510,11 +1532,11 @@ func TestEdgeCaseRepos(t *testing.T) {
CheckUnused: true,
}
t.Run("duplicates", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, false, true)
testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, pruneDefaultOptions, false, true)
})
}
func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkOK, pruneOK bool) {
func testEdgeCaseRepo(t *testing.T, tarfile string, optionsCheck CheckOptions, optionsPrune PruneOptions, checkOK, pruneOK bool) {
env, cleanup := withTestEnvironment(t)
defer cleanup()
@@ -1524,15 +1546,15 @@ func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkOK, pruneOK bool) {
if checkOK {
testRunCheck(t, env.gopts)
} else {
rtest.Assert(t, runCheck(options, env.gopts, nil) != nil,
rtest.Assert(t, runCheck(optionsCheck, env.gopts, nil) != nil,
"check should have reported an error")
}
if pruneOK {
testRunPrune(t, env.gopts)
testRunPrune(t, env.gopts, optionsPrune)
testRunCheck(t, env.gopts)
} else {
rtest.Assert(t, runPrune(env.gopts) != nil,
rtest.Assert(t, runPrune(optionsPrune, env.gopts) != nil,
"prune should have reported an error")
}
}


@@ -23,12 +23,11 @@ data that was referenced by the snapshot from the repository. This can
be automated with the ``--prune`` option of the ``forget`` command,
which runs ``prune`` automatically if snapshots have been removed.
.. Warning::
Pruning snapshots can be a very time-consuming process, taking nearly
as long as backups themselves. During a prune operation, the index is
locked and backups cannot be completed. Performance improvements are
planned for this feature.
Pruning snapshots can be a time-consuming process, depending on the
number of snapshots and the amount of data to process. During a prune
operation, the repository is locked and backups cannot be completed. Please
plan your pruning so that there's enough time for it to complete without
interfering with regular backup runs.
It is advisable to run ``restic check`` after pruning, to make sure
you are alerted, should the internal data structures of the repository
@@ -82,20 +81,32 @@ command must be run:
$ restic -r /srv/restic-repo prune
enter password for repository:
repository 33002c5e opened successfully, password is correct
loading all snapshots...
loading indexes...
finding data that is still in use for 4 snapshots
[0:00] 100.00% 4 / 4 snapshots
searching used packs...
collecting packs for deletion and repacking
[0:00] 100.00% 5 / 5 packs processed
to repack: 69 blobs / 1.078 MiB
this removes 67 blobs / 1.047 MiB
to delete: 7 blobs / 25.726 KiB
total prune: 74 blobs / 1.072 MiB
remaining: 16 blobs / 38.003 KiB
unused size after prune: 0 B (0.00% of remaining size)
repacking packs
[0:00] 100.00% 2 / 2 packs repacked
counting files in repo
building new index for repo
[0:00] 100.00% 22 / 22 files
repository contains 22 packs (8512 blobs) with 100.092 MiB bytes
processed 8512 blobs: 0 duplicate blobs, 0B duplicate
load all snapshots
find data that is still in use for 1 snapshots
[0:00] 100.00% 1 / 1 snapshots
found 8433 of 8512 data blobs still in use
will rewrite 3 packs
creating new index
[0:00] 86.36% 19 / 22 files
saved new index as 544a5084
[0:00] 100.00% 3 / 3 packs
finding old index files
saved new indexes as [59270b3a]
remove 4 old index files
[0:00] 100.00% 4 / 4 files deleted
removing 3 old packs
[0:00] 100.00% 3 / 3 files deleted
done
Afterwards the repository is smaller.
@@ -119,19 +130,31 @@ to ``forget``:
8c02b94b 2017-02-21 10:48:33 mopped /home/user/work
1 snapshots have been removed, running prune
counting files in repo
building new index for repo
[0:00] 100.00% 37 / 37 packs
repository contains 37 packs (5521 blobs) with 151.012 MiB bytes
processed 5521 blobs: 0 duplicate blobs, 0B duplicate
load all snapshots
find data that is still in use for 1 snapshots
loading all snapshots...
loading indexes...
finding data that is still in use for 1 snapshots
[0:00] 100.00% 1 / 1 snapshots
found 5323 of 5521 data blobs still in use, removing 198 blobs
will delete 0 packs and rewrite 27 packs, this frees 22.106 MiB
creating new index
[0:00] 100.00% 30 / 30 packs
saved new index as b49f3e68
searching used packs...
collecting packs for deletion and repacking
[0:00] 100.00% 5 / 5 packs processed
to repack: 69 blobs / 1.078 MiB
this removes 67 blobs / 1.047 MiB
to delete: 7 blobs / 25.726 KiB
total prune: 74 blobs / 1.072 MiB
remaining: 16 blobs / 38.003 KiB
unused size after prune: 0 B (0.00% of remaining size)
repacking packs
[0:00] 100.00% 2 / 2 packs repacked
counting files in repo
[0:00] 100.00% 3 / 3 packs
finding old index files
saved new indexes as [59270b3a]
remove 4 old index files
[0:00] 100.00% 4 / 4 files deleted
removing 3 old packs
[0:00] 100.00% 3 / 3 files deleted
done
Removing snapshots according to a policy
@@ -282,3 +305,59 @@ last-day-of-the-months (11 or 12 depends if the 5 weeklies cross a month).
And finally 75 last-day-of-the-year snapshots. All other snapshots are
removed.
Customize pruning
*****************
To understand the custom options, we first explain how the pruning process works:
1. All snapshots and directories within snapshots are scanned to determine
which data is still in use.
2. For all files in the repository, restic determines whether the file is fully
used, partly used or completely unused.
3. Completely unused files are marked for deletion. Fully used files are kept.
A partially used file is either kept or marked for repacking depending on user
options.
Note that for repacking, restic must download the file from the repository
storage and re-upload the needed data to the repository. This can be very
time-consuming for remote repositories.
4. After deciding what to do, ``prune`` will actually perform the repack, modify
the index according to the changes and delete the obsolete files.
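These steps can be inspected before committing to anything by running ``prune``
with the ``--dry-run`` option (both options used here are described below), for
example:

$ restic -r /srv/restic-repo prune --dry-run --verbose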
The ``prune`` command accepts the following options:
- ``--max-unused limit`` allows unused data up to the specified limit within the repository.
This allows restic to keep partly used files instead of repacking them.
The limit can be specified in several ways:
* As an absolute size (e.g. ``200M``). If you want to minimize the space
used by your repository, pass ``0`` to this option.
* As a size relative to the total repo size (e.g. ``10%``). This means that
after prune, at most ``10%`` of the total data stored in the repo may be
unused data. If the repo after prune has a size of 500MB, then at most
50MB may be unused.
* If the string ``unlimited`` is passed, there is no limit for partly
unused files. This means that as long as some data is still used within
a file stored in the repo, restic will just leave it there. Use this if
you want to minimize the time and bandwidth used by the ``prune``
operation.
Restic tries to repack as little data as possible while still ensuring this
limit for unused data.
- ``--max-repack-size size`` if set, limits the total size of files to repack.
As ``prune`` first stores all repacked files and only deletes the obsolete files at the end,
this option can be handy if you expect many files to be repacked and are
concerned about running low on storage.
- ``--repack-cacheable-only`` if set to true, only files which contain
metadata and would be stored in the cache are repacked. Other pack files are
not repacked if this option is set. This allows very fast repacking
using only cached data. It can, however, imply that the unused data in
your repository exceeds the value given by ``--max-unused``.
The default value is false.
- ``--dry-run`` only shows what ``prune`` would do.
- ``--verbose`` increases verbosity; ``prune`` then shows additional statistics.
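The options can be combined. For example, to tolerate at most 10% unused data
in the repository while never repacking more than 500M in a single run (the
values here are only illustrative):

$ restic -r /srv/restic-repo prune --max-unused 10% --max-repack-size 500M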


@@ -161,13 +161,16 @@ func (p *Packer) String() string {
}
var (
// size of the header-length field at the end of the file
headerLengthSize = binary.Size(uint32(0))
// we require at least one entry in the header, and one blob for a pack file
minFileSize = entrySize + crypto.Extension + uint(headerLengthSize)
)
const (
// size of the header-length field at the end of the file; it is a uint32
headerLengthSize = 4
// constant overhead of the header independent of #entries
HeaderSize = headerLengthSize + crypto.Extension
maxHeaderSize = 16 * 1024 * 1024
// number of header entries to download as part of header-length request
eagerEntries = 15
@@ -315,3 +318,8 @@ func List(k *crypto.Key, rd io.ReaderAt, size int64) (entries []restic.Blob, err error) {
return entries, nil
}
// PackedSizeOfBlob returns the size a blob actually uses when saved in a pack
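// (the blob's length within the pack plus the size of its entry in the pack header)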
func PackedSizeOfBlob(blobLength uint) uint {
return blobLength + entrySize
}


@@ -19,6 +19,10 @@ func (b Blob) String() string {
b.Type, b.ID.Str(), b.Offset, b.Length)
}
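// Handle returns the BlobHandle for the blob, i.e. its ID and type.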
func (b Blob) Handle() BlobHandle {
return BlobHandle{ID: b.ID, Type: b.Type}
}
// PackedBlob is a blob stored within a file.
type PackedBlob struct {
Blob


@@ -67,8 +67,12 @@ func LoadSnapshot(ctx context.Context, repo Repository, id ID) (*Snapshot, error) {
}
// LoadAllSnapshots returns a list of all snapshots in the repo.
func LoadAllSnapshots(ctx context.Context, repo Repository) (snapshots []*Snapshot, err error) {
// If a snapshot ID is in excludeIDs, it will not be included in the result.
func LoadAllSnapshots(ctx context.Context, repo Repository, excludeIDs IDSet) (snapshots []*Snapshot, err error) {
err = repo.List(ctx, SnapshotFile, func(id ID, size int64) error {
if excludeIDs.Has(id) {
return nil
}
sn, err := LoadSnapshot(ctx, repo, id)
if err != nil {
return err


@@ -25,7 +25,7 @@ func TestCreateSnapshot(t *testing.T) {
restic.TestCreateSnapshot(t, repo, testSnapshotTime.Add(time.Duration(i)*time.Second), testDepth, 0)
}
snapshots, err := restic.LoadAllSnapshots(context.TODO(), repo)
snapshots, err := restic.LoadAllSnapshots(context.TODO(), repo, restic.NewIDSet())
if err != nil {
t.Fatal(err)
}