Reimplementation of prune

Alexander Weiss 2020-07-19 07:55:14 +02:00 committed by Alexander Neumann
parent 3b591ed987
commit 7f9a0a5907
5 changed files with 553 additions and 211 deletions

View file

@@ -0,0 +1,22 @@
Enhancement: Improve pruning performance and make pruning more customizable
The prune command is now much faster. This is especially the case for remote
repositories or repositories that contain little data to prune.
The memory usage of the prune command has also been reduced.
By default, the prune command no longer removes all unused blobs. This
behavior can be fine-tuned with new options, such as the tolerated amount of
unused space or the maximum size of packs to repack. For more details, see
https://restic.readthedocs.io/en/stable/060_forget.html
Moreover, prune now accepts the --dry-run option, and forget --dry-run --prune
also shows what prune would do.
Fixes several open issues, e.g.:
https://github.com/restic/restic/issues/1140
https://github.com/restic/restic/issues/1985
https://github.com/restic/restic/issues/2112
https://github.com/restic/restic/issues/2227
https://github.com/restic/restic/issues/2305
https://github.com/restic/restic/pull/2718
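
As a rough illustration of how these new options fit together, here is a hypothetical snippet (not part of this commit) that drives the new code path from Go, using the PruneOptions struct and runPrune function added in cmd_prune.go below; runPrune validates the option strings itself via verifyPruneOptions:

// pruneDryRun is a hypothetical helper illustrating the new options; it assumes
// it is compiled into the same package as the code added by this commit.
func pruneDryRun(gopts GlobalOptions) error {
	opts := PruneOptions{
		DryRun:             true,  // only report what would be done
		MaxUnused:          "5%",  // tolerate up to 5% unused space (the new default)
		MaxRepackSize:      "1G",  // repack at most 1 GiB of pack files
		RepackCachableOnly: false, // also consider non-cacheable (data) packs for repacking
	}
	return runPrune(opts, gopts)
}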

View file

@@ -80,9 +80,15 @@ func init() {
f.BoolVar(&forgetOptions.Prune, "prune", false, "automatically run the 'prune' command if snapshots have been removed")
f.SortFlags = false
addPruneOptions(cmdForget)
}
func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error {
err := verifyPruneOptions(&pruneOptions)
if err != nil {
return err
}
repo, err := OpenRepository(gopts)
if err != nil {
return err
@@ -205,7 +211,12 @@ func runForget(opts ForgetOptions, gopts GlobalOptions, args []string) error {
}
if len(removeSnIDs) > 0 && opts.Prune && !opts.DryRun {
return pruneRepository(gopts, repo)
if !gopts.JSON {
Verbosef("%d snapshots have been removed, running prune\n", len(removeSnIDs))
}
pruneOptions.DryRun = opts.DryRun
return runPruneWithRepo(pruneOptions, gopts, repo)
}
return nil

View file

@@ -1,15 +1,22 @@
package main
import (
"sort"
"strconv"
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/index"
"github.com/restic/restic/internal/pack"
"github.com/restic/restic/internal/repository"
"github.com/restic/restic/internal/restic"
"github.com/spf13/cobra"
)
var errorIndexIncomplete = errors.Fatal("index is not complete")
var errorPacksMissing = errors.Fatal("packs from index missing in repo")
var errorSizeNotMatching = errors.Fatal("pack size does not match calculated size from index")
var cmdPrune = &cobra.Command{
Use: "prune [flags]",
Short: "Remove unneeded data from the repository",
@@ -24,12 +31,72 @@ Exit status is 0 if the command was successful, and non-zero if there was any er
`,
DisableAutoGenTag: true,
RunE: func(cmd *cobra.Command, args []string) error {
return runPrune(globalOptions)
return runPrune(pruneOptions, globalOptions)
},
}
// PruneOptions collects all options for the cleanup command.
type PruneOptions struct {
DryRun bool
MaxUnused string
MaxUnusedPercent float64
MaxUnusedBytes uint64
MaxRepackSize string
MaxRepackBytes uint64
// RepackSmall bool <- This option may be added later
RepackCachableOnly bool
}
var pruneOptions PruneOptions
func init() {
cmdRoot.AddCommand(cmdPrune)
f := cmdPrune.Flags()
f.BoolVarP(&pruneOptions.DryRun, "dry-run", "n", false, "do not modify the repository, just print what would be done")
addPruneOptions(cmdPrune)
}
func addPruneOptions(c *cobra.Command) {
f := c.Flags()
f.StringVar(&pruneOptions.MaxUnused, "max-unused", "5%", "tolerate given `limit` of unused space (allowed suffixes: k/K, m/M, g/G, t/T or value in %)")
f.StringVar(&pruneOptions.MaxRepackSize, "max-repack-size", "", "maximum `size` to repack (allowed suffixes: k/K, m/M, g/G, t/T)")
f.BoolVar(&pruneOptions.RepackCachableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable")
// f.BoolVar(&pruneOptions.RepackSmall, "repack-small", false, "also repack small packs")
}
func verifyPruneOptions(opts *PruneOptions) error {
if len(opts.MaxRepackSize) > 0 {
size, err := parseSizeStr(opts.MaxRepackSize)
if err != nil {
return err
}
opts.MaxRepackBytes = uint64(size)
}
length := len(opts.MaxUnused)
if length == 0 {
return nil
}
var err error
if opts.MaxUnused[length-1] == '%' {
opts.MaxUnusedPercent, err = strconv.ParseFloat(opts.MaxUnused[:length-1], 64)
opts.MaxUnusedBytes = ^uint64(0)
} else {
var size int64
size, err = parseSizeStr(opts.MaxUnused)
opts.MaxUnusedPercent = 100.0
opts.MaxUnusedBytes = uint64(size)
}
if err != nil {
return err
}
if opts.MaxUnusedPercent < 0.0 || opts.MaxUnusedPercent > 100.0 {
return errors.Fatalf("--max-unused-percent should be between 0 and 100. Given value: %f", opts.MaxUnusedPercent)
}
return nil
}
func shortenStatus(maxLength int, s string) string {
@@ -44,7 +111,12 @@ func shortenStatus(maxLength int, s string) string {
return s[:maxLength-3] + "..."
}
func runPrune(gopts GlobalOptions) error {
func runPrune(opts PruneOptions, gopts GlobalOptions) error {
err := verifyPruneOptions(&opts)
if err != nil {
return err
}
repo, err := OpenRepository(gopts)
if err != nil {
return err
@@ -56,203 +128,345 @@ func runPrune(gopts GlobalOptions) error {
return err
}
return runPruneWithRepo(opts, gopts, repo)
}
func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.Repository) error {
// we do not need index updates while pruning!
repo.DisableAutoIndexUpdate()
return pruneRepository(gopts, repo)
}
func mixedBlobs(list []restic.Blob) bool {
var tree, data bool
for _, pb := range list {
switch pb.Type {
case restic.TreeBlob:
tree = true
case restic.DataBlob:
data = true
}
if tree && data {
return true
}
}
return false
}
func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
ctx := gopts.ctx
err := repo.LoadIndex(ctx)
Verbosef("loading all snapshots...\n")
snapshots, err := restic.LoadAllSnapshots(gopts.ctx, repo)
if err != nil {
return err
}
var stats struct {
blobs int
packs int
snapshots int
bytes int64
}
Verbosef("counting files in repo\n")
err = repo.List(ctx, restic.PackFile, func(restic.ID, int64) error {
stats.packs++
return nil
})
Verbosef("loading indexes...\n")
err = repo.LoadIndex(gopts.ctx)
if err != nil {
return err
}
Verbosef("building new index for repo\n")
bar := newProgressMax(!gopts.Quiet, uint64(stats.packs), "packs")
idx, invalidFiles, err := index.New(ctx, repo, restic.NewIDSet(), bar)
if err != nil {
return err
}
for _, id := range invalidFiles {
Warnf("incomplete pack file (will be removed): %v\n", id)
}
blobs := 0
for _, pack := range idx.Packs {
stats.bytes += pack.Size
blobs += len(pack.Entries)
}
Verbosef("repository contains %v packs (%v blobs) with %v\n",
len(idx.Packs), blobs, formatBytes(uint64(stats.bytes)))
blobCount := make(map[restic.BlobHandle]int)
var duplicateBlobs uint64
var duplicateBytes uint64
// find duplicate blobs
for _, p := range idx.Packs {
for _, entry := range p.Entries {
stats.blobs++
h := restic.BlobHandle{ID: entry.ID, Type: entry.Type}
blobCount[h]++
if blobCount[h] > 1 {
duplicateBlobs++
duplicateBytes += uint64(entry.Length)
}
}
}
Verbosef("processed %d blobs: %d duplicate blobs, %v duplicate\n",
stats.blobs, duplicateBlobs, formatBytes(uint64(duplicateBytes)))
Verbosef("load all snapshots\n")
// find referenced blobs
snapshots, err := restic.LoadAllSnapshots(ctx, repo)
if err != nil {
return err
}
stats.snapshots = len(snapshots)
usedBlobs, err := getUsedBlobs(gopts, repo, snapshots)
if err != nil {
return err
}
var missingBlobs []restic.BlobHandle
for h := range usedBlobs {
if _, ok := blobCount[h]; !ok {
missingBlobs = append(missingBlobs, h)
return prune(opts, gopts, repo, usedBlobs)
}
type packInfo struct {
usedBlobs uint
unusedBlobs uint
duplicateBlobs uint
usedSize uint64
unusedSize uint64
tpe restic.BlobType
}
type packInfoWithID struct {
ID restic.ID
packInfo
}
func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedBlobs restic.BlobSet) error {
ctx := gopts.ctx
var stats struct {
blobs struct {
used uint
duplicate uint
unused uint
remove uint
repack uint
repackrm uint
}
size struct {
used uint64
duplicate uint64
unused uint64
remove uint64
repack uint64
repackrm uint64
unref uint64
}
packs struct {
used uint
unused uint
partlyUsed uint
keep uint
}
}
if len(missingBlobs) > 0 {
return errors.Fatalf("%v not found in the new index\n"+
Verbosef("searching used packs...\n")
keepBlobs := restic.NewBlobSet()
duplicateBlobs := restic.NewBlobSet()
// iterate over all blobs in index to find out which blobs are duplicates
for blob := range repo.Index().Each(ctx) {
bh := blob.Handle()
switch {
case usedBlobs.Has(bh): // used blob, move to keepBlobs
usedBlobs.Delete(bh)
keepBlobs.Insert(bh)
case keepBlobs.Has(bh): // duplicate blob
duplicateBlobs.Insert(bh)
}
}
// Check if all used blobs have been found in index
if len(usedBlobs) != 0 {
Warnf("%v not found in the new index\n"+
"Data blobs seem to be missing, aborting prune to prevent further data loss!\n"+
"Please report this error (along with the output of the 'prune' run) at\n"+
"https://github.com/restic/restic/issues/new/choose", missingBlobs)
"https://github.com/restic/restic/issues/new/choose", usedBlobs)
return errorIndexIncomplete
}
Verbosef("found %d of %d data blobs still in use, removing %d blobs\n",
len(usedBlobs), stats.blobs, stats.blobs-len(usedBlobs))
indexPack := make(map[restic.ID]packInfo)
// find packs that need a rewrite
rewritePacks := restic.NewIDSet()
for _, pack := range idx.Packs {
if mixedBlobs(pack.Entries) {
rewritePacks.Insert(pack.ID)
continue
// iterate over all blobs in index to generate packInfo
for blob := range repo.Index().Each(ctx) {
ip, ok := indexPack[blob.PackID]
if !ok {
ip = packInfo{tpe: blob.Type, usedSize: pack.HeaderSize}
}
// mark mixed packs with "Invalid blob type"
if ip.tpe != blob.Type {
ip.tpe = restic.InvalidBlob
}
for _, blob := range pack.Entries {
h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
if !usedBlobs.Has(h) {
rewritePacks.Insert(pack.ID)
continue
}
if blobCount[h] > 1 {
rewritePacks.Insert(pack.ID)
}
bh := blob.Handle()
size := uint64(pack.PackedSizeOfBlob(blob.Length))
switch {
case duplicateBlobs.Has(bh): // duplicate blob
ip.usedSize += size
ip.duplicateBlobs++
stats.size.duplicate += size
stats.blobs.duplicate++
case keepBlobs.Has(bh): // used blob, not duplicate
ip.usedSize += size
ip.usedBlobs++
stats.size.used += size
stats.blobs.used++
default: // unused blob
ip.unusedSize += size
ip.unusedBlobs++
stats.size.unused += size
stats.blobs.unused++
}
// update indexPack
indexPack[blob.PackID] = ip
}
removeBytes := duplicateBytes
// find packs that are unneeded
Verbosef("collecting packs for deletion and repacking\n")
removePacksFirst := restic.NewIDSet()
removePacks := restic.NewIDSet()
repackPacks := restic.NewIDSet()
Verbosef("will remove %d invalid files\n", len(invalidFiles))
for _, id := range invalidFiles {
removePacks.Insert(id)
var repackCandidates []packInfoWithID
repack := func(id restic.ID, p packInfo) {
repackPacks.Insert(id)
stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs
stats.size.repack += p.unusedSize + p.usedSize
stats.blobs.repackrm += p.unusedBlobs
stats.size.repackrm += p.unusedSize
}
for packID, p := range idx.Packs {
hasActiveBlob := false
for _, blob := range p.Entries {
h := restic.BlobHandle{ID: blob.ID, Type: blob.Type}
if usedBlobs.Has(h) {
hasActiveBlob = true
continue
}
removeBytes += uint64(blob.Length)
// loop over all packs and decide what to do
bar := newProgressMax(!gopts.Quiet, uint64(len(indexPack)), "packs processed")
bar.Start()
err := repo.List(ctx, restic.PackFile, func(id restic.ID, packSize int64) error {
p, ok := indexPack[id]
if !ok {
// Pack was not referenced in index and is not used => immediately remove!
Verboseff("will remove pack %v as it is unused and not indexed\n", id.Str())
removePacksFirst.Insert(id)
stats.size.unref += uint64(packSize)
return nil
}
if hasActiveBlob {
continue
if p.unusedSize+p.usedSize != uint64(packSize) {
Warnf("pack %s: calculated size %d does not match real size %d\nRun 'restic rebuild-index'.",
id.Str(), p.unusedSize+p.usedSize, packSize)
return errorSizeNotMatching
}
removePacks.Insert(packID)
if !rewritePacks.Has(packID) {
return errors.Fatalf("pack %v is unneeded, but not contained in rewritePacks", packID.Str())
// statistics
switch {
case p.usedBlobs == 0 && p.duplicateBlobs == 0:
stats.packs.unused++
case p.unusedBlobs == 0:
stats.packs.used++
default:
stats.packs.partlyUsed++
}
rewritePacks.Delete(packID)
}
// decide what to do
switch {
case p.usedBlobs == 0 && p.duplicateBlobs == 0:
// All blobs in pack are no longer used => remove pack!
removePacks.Insert(id)
stats.blobs.remove += p.unusedBlobs
stats.size.remove += p.unusedSize
Verbosef("will delete %d packs and rewrite %d packs, this frees %s\n",
len(removePacks), len(rewritePacks), formatBytes(uint64(removeBytes)))
case opts.RepackCachableOnly && p.tpe == restic.DataBlob,
// if this is a data pack and --repack-cacheable-only is set => keep pack!
p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob: // && (!opts.RepackSmall || packSize >= repository.MinPackSize)
// All blobs in pack are used and not duplicates/mixed => keep pack!
stats.packs.keep++
var obsoletePacks restic.IDSet
if len(rewritePacks) != 0 {
bar := newProgressMax(!gopts.Quiet, uint64(len(rewritePacks)), "packs rewritten")
obsoletePacks, err = repository.Repack(ctx, repo, rewritePacks, usedBlobs, bar)
if err != nil {
return err
default:
// all other packs are candidates for repacking
repackCandidates = append(repackCandidates, packInfoWithID{ID: id, packInfo: p})
}
}
removePacks.Merge(obsoletePacks)
if err = rebuildIndex(ctx, repo, removePacks); err != nil {
delete(indexPack, id)
bar.Report(restic.Stat{Blobs: 1})
return nil
})
bar.Done()
if err != nil {
return err
}
if len(indexPack) != 0 {
Warnf("The index references pack files which are missing from the repository: %v\n", indexPack)
return errorPacksMissing
}
repackAllPacksWithDuplicates := true
maxUnusedSizeAfter := opts.MaxUnusedBytes
if opts.MaxUnusedPercent < 100.0 {
maxUnusedSizePercent := uint64(opts.MaxUnusedPercent / (100.0 - opts.MaxUnusedPercent) * float64(stats.size.used))
if maxUnusedSizePercent < maxUnusedSizeAfter {
maxUnusedSizeAfter = maxUnusedSizePercent
}
}
// Sort repackCandidates such that packs with highest ratio unused/used space are picked first.
// This is equivalent to sorting by unused / total space.
// Instead of unused[i] / used[i] > unused[j] / used[j] we use
// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
// Moreover, packs containing duplicates and mixed packs are sorted to the beginning
sort.Slice(repackCandidates, func(i, j int) bool {
pi := repackCandidates[i].packInfo
pj := repackCandidates[j].packInfo
switch {
case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0:
return true
case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0:
return false
case pi.tpe == restic.InvalidBlob && pj.tpe != restic.InvalidBlob:
return true
case pj.tpe == restic.InvalidBlob && pi.tpe != restic.InvalidBlob:
return false
//case opts.RepackSmall && pi.unusedSize+pi.usedSize < repository.MinPackSize && pj.unusedSize+pj.usedSize >= repository.MinPackSize:
// return true
//case opts.RepackSmall && pj.unusedSize+pj.usedSize < repository.MinPackSize && pi.unusedSize+pi.usedSize >= repository.MinPackSize:
// return false
}
return pi.unusedSize*pj.usedSize > pj.unusedSize*pi.usedSize
})
for _, p := range repackCandidates {
reachedUnusedSizeAfter := (stats.size.unused-stats.size.remove-stats.size.repackrm < maxUnusedSizeAfter)
reachedRepackSize := (len(opts.MaxRepackSize) > 0 && stats.size.repack+p.unusedSize+p.usedSize > opts.MaxRepackBytes)
switch {
case !reachedRepackSize && (p.duplicateBlobs > 0 || p.tpe == restic.InvalidBlob):
// repacking duplicates/mixed is only limited by repackSize
repack(p.ID, p.packInfo)
case reachedUnusedSizeAfter, reachedRepackSize:
// for all other packs stop repacking if tolerated unused size is reached.
stats.packs.keep++
if p.duplicateBlobs > 0 {
repackAllPacksWithDuplicates = false
}
default:
repack(p.ID, p.packInfo)
}
}
// if all duplicates are repacked, print out correct statistics
if repackAllPacksWithDuplicates {
stats.blobs.repackrm += stats.blobs.duplicate
stats.size.repackrm += stats.size.duplicate
}
Verboseff("\nused: %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used))
if stats.blobs.duplicate > 0 {
Verboseff("duplicates: %10d blobs / %s\n", stats.blobs.duplicate, formatBytes(stats.size.duplicate))
}
Verboseff("unused: %10d blobs / %s\n", stats.blobs.unused, formatBytes(stats.size.unused))
if stats.size.unref > 0 {
Verboseff("unreferenced: %s\n", formatBytes(stats.size.unref))
}
totalBlobs := stats.blobs.used + stats.blobs.unused + stats.blobs.duplicate
totalSize := stats.size.used + stats.size.duplicate + stats.size.unused + stats.size.unref
Verboseff("total: %10d blobs / %s\n", totalBlobs, formatBytes(totalSize))
Verboseff("unused size: %s of total size\n", formatPercent(stats.size.unused, totalSize))
Verbosef("\nto repack: %10d blobs / %s\n", stats.blobs.repack, formatBytes(stats.size.repack))
Verbosef("this removes %10d blobs / %s\n", stats.blobs.repackrm, formatBytes(stats.size.repackrm))
Verbosef("to delete: %10d blobs / %s\n", stats.blobs.remove, formatBytes(stats.size.remove+stats.size.unref))
totalPruneSize := stats.size.remove + stats.size.repackrm + stats.size.unref
Verbosef("total prune: %10d blobs / %s\n", stats.blobs.remove+stats.blobs.repackrm, formatBytes(totalPruneSize))
Verbosef("remaining: %10d blobs / %s\n", totalBlobs-(stats.blobs.remove+stats.blobs.repackrm), formatBytes(totalSize-totalPruneSize))
unusedAfter := stats.size.unused - stats.size.remove - stats.size.repackrm
Verbosef("unused size after prune: %s (%s of remaining size)\n",
formatBytes(unusedAfter), formatPercent(unusedAfter, totalSize-totalPruneSize))
Verbosef("\n")
Verboseff("totally used packs: %10d\n", stats.packs.used)
Verboseff("partly used packs: %10d\n", stats.packs.partlyUsed)
Verboseff("unused packs: %10d\n\n", stats.packs.unused)
Verboseff("to keep: %10d packs\n", stats.packs.keep)
Verboseff("to repack: %10d packs\n", len(repackPacks))
Verboseff("to delete: %10d packs\n", len(removePacks))
if len(removePacksFirst) > 0 {
Verboseff("to delete: %10d unreferenced packs\n\n", len(removePacksFirst))
}
if opts.DryRun {
if !gopts.JSON && gopts.verbosity >= 2 {
if len(removePacksFirst) > 0 {
Printf("Would have removed the following unreferenced packs:\n%v\n\n", removePacksFirst)
}
Printf("Would have repacked and removed the following packs:\n%v\n\n", repackPacks)
Printf("Would have removed the following no longer used packs:\n%v\n\n", removePacks)
}
// Always quit here if DryRun was set!
return nil
}
// unreferenced packs can be safely deleted first
if len(removePacksFirst) != 0 {
Verbosef("deleting unreferenced packs\n")
DeleteFiles(gopts, repo, removePacksFirst, restic.PackFile)
}
if len(repackPacks) != 0 {
Verbosef("repacking packs\n")
bar := newProgressMax(!gopts.Quiet, uint64(len(repackPacks)), "packs repacked")
_, err := repository.Repack(ctx, repo, repackPacks, keepBlobs, bar)
if err != nil {
return err
}
// Also remove repacked packs
removePacks.Merge(repackPacks)
}
if len(removePacks) != 0 {
Verbosef("remove %d old packs\n", len(removePacks))
if err = rebuildIndex(ctx, repo, removePacks); err != nil {
return err
}
Verbosef("removing %d old packs\n", len(removePacks))
DeleteFiles(gopts, repo, removePacks, restic.PackFile)
}
@@ -263,7 +477,7 @@ func pruneRepository(gopts GlobalOptions, repo restic.Repository) error {
func getUsedBlobs(gopts GlobalOptions, repo restic.Repository, snapshots []*restic.Snapshot) (usedBlobs restic.BlobSet, err error) {
ctx := gopts.ctx
Verbosef("find data that is still in use for %d snapshots\n", len(snapshots))
Verbosef("finding data that is still in use for %d snapshots\n", len(snapshots))
usedBlobs = restic.NewBlobSet()

View file

@@ -270,8 +270,8 @@ func testRunForgetJSON(t testing.TB, gopts GlobalOptions, args ...string) {
"Expected 2 snapshots to be removed, got %v", len(forgets[0].Remove))
}
func testRunPrune(t testing.TB, gopts GlobalOptions) {
rtest.OK(t, runPrune(gopts))
func testRunPrune(t testing.TB, gopts GlobalOptions, opts PruneOptions) {
rtest.OK(t, runPrune(opts, gopts))
}
func testSetupBackupData(t testing.TB, env *testEnvironment) string {
@@ -1386,6 +1386,41 @@ func TestCheckRestoreNoLock(t *testing.T) {
}
func TestPrune(t *testing.T) {
t.Run("0", func(t *testing.T) {
opts := PruneOptions{MaxUnusedPercent: 0.0}
checkOpts := CheckOptions{ReadData: true, CheckUnused: true}
testPrune(t, opts, checkOpts)
})
t.Run("50", func(t *testing.T) {
opts := PruneOptions{MaxUnusedPercent: 50.0}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
t.Run("100", func(t *testing.T) {
opts := PruneOptions{MaxUnusedPercent: 100.0}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
t.Run("CachableOnly", func(t *testing.T) {
opts := PruneOptions{RepackCachableOnly: true}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
/* repack-small option will come in a future version
t.Run("Small", func(t *testing.T) {
opts = PruneOptions{MaxUnusedPercent: 100.0, RepackSmall: true}
// The test case only produces small files; hence no unused blobs should remain.
checkOpts = CheckOptions{ReadData: true, CheckUnused: true}
testPrune(t, opts, checkOpts)
})
*/
}
func testPrune(t *testing.T, pruneOpts PruneOptions, checkOpts CheckOptions) {
env, cleanup := withTestEnvironment(t)
defer cleanup()
@@ -1406,10 +1441,12 @@ func TestPrune(t *testing.T) {
testRunForgetJSON(t, env.gopts)
testRunForget(t, env.gopts, firstSnapshot[0].String())
testRunPrune(t, env.gopts)
testRunCheck(t, env.gopts)
testRunPrune(t, env.gopts, pruneOpts)
rtest.OK(t, runCheck(checkOpts, env.gopts, nil))
}
var pruneDefaultOptions = PruneOptions{MaxUnusedPercent: 1.5}
func listPacks(gopts GlobalOptions, t *testing.T) restic.IDSet {
r, err := OpenRepository(gopts)
rtest.OK(t, err)
@@ -1452,14 +1489,8 @@ func TestPruneWithDamagedRepository(t *testing.T) {
"expected one snapshot, got %v", snapshotIDs)
// prune should fail
err := runPrune(env.gopts)
if err == nil {
t.Fatalf("expected prune to fail")
}
if !strings.Contains(err.Error(), "blobs seem to be missing") {
t.Fatalf("did not find hint for missing blobs")
}
t.Log(err)
rtest.Assert(t, runPrune(pruneDefaultOptions, env.gopts) == errorPacksMissing,
"prune should have reported index not complete error")
}
// Test repos for edge cases
@@ -1469,37 +1500,37 @@ func TestEdgeCaseRepos(t *testing.T) {
// repo where index is completely missing
// => check and prune should fail
t.Run("no-index", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, false, false)
testEdgeCaseRepo(t, "repo-index-missing.tar.gz", opts, pruneDefaultOptions, false, false)
})
// repo where an existing and used blob is missing from the index
// => check should fail, prune should heal this
// => check and prune should fail
t.Run("index-missing-blob", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, false, true)
testEdgeCaseRepo(t, "repo-index-missing-blob.tar.gz", opts, pruneDefaultOptions, false, false)
})
// repo where a blob is missing
// => check and prune should fail
t.Run("no-data", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, false, false)
testEdgeCaseRepo(t, "repo-data-missing.tar.gz", opts, pruneDefaultOptions, false, false)
})
// repo where data exists that is not referenced
// => check and prune should fully work
t.Run("unreferenced-data", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, true, true)
testEdgeCaseRepo(t, "repo-unreferenced-data.tar.gz", opts, pruneDefaultOptions, true, true)
})
// repo where an obsolete index still exists
// => check and prune should fully work
t.Run("obsolete-index", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, true, true)
testEdgeCaseRepo(t, "repo-obsolete-index.tar.gz", opts, pruneDefaultOptions, true, true)
})
// repo which contains mixed (data/tree) packs
// => check and prune should fully work
t.Run("mixed-packs", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, true, true)
testEdgeCaseRepo(t, "repo-mixed.tar.gz", opts, pruneDefaultOptions, true, true)
})
// repo which contains duplicate blobs
@@ -1510,11 +1541,11 @@ func TestEdgeCaseRepos(t *testing.T) {
CheckUnused: true,
}
t.Run("duplicates", func(t *testing.T) {
testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, false, true)
testEdgeCaseRepo(t, "repo-duplicates.tar.gz", opts, pruneDefaultOptions, false, true)
})
}
func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkOK, pruneOK bool) {
func testEdgeCaseRepo(t *testing.T, tarfile string, optionsCheck CheckOptions, optionsPrune PruneOptions, checkOK, pruneOK bool) {
env, cleanup := withTestEnvironment(t)
defer cleanup()
@@ -1524,15 +1555,15 @@ func testEdgeCaseRepo(t *testing.T, tarfile string, options CheckOptions, checkO
if checkOK {
testRunCheck(t, env.gopts)
} else {
rtest.Assert(t, runCheck(options, env.gopts, nil) != nil,
rtest.Assert(t, runCheck(optionsCheck, env.gopts, nil) != nil,
"check should have reported an error")
}
if pruneOK {
testRunPrune(t, env.gopts)
testRunPrune(t, env.gopts, optionsPrune)
testRunCheck(t, env.gopts)
} else {
rtest.Assert(t, runPrune(env.gopts) != nil,
rtest.Assert(t, runPrune(optionsPrune, env.gopts) != nil,
"prune should have reported an error")
}
}

View file

@@ -23,12 +23,11 @@ data that was referenced by the snapshot from the repository. This can
be automated with the ``--prune`` option of the ``forget`` command,
which runs ``prune`` automatically if snapshots have been removed.
.. Warning::
Pruning snapshots can be a very time-consuming process, taking nearly
as long as backups themselves. During a prune operation, the index is
locked and backups cannot be completed. Performance improvements are
planned for this feature.
Pruning snapshots can be a time-consuming process, depending on the
number of snapshots and the amount of data to process. During a prune
operation, the repository is locked and backups cannot be completed. Please
plan your pruning so that there's time to complete it and it doesn't
interfere with regular backup runs.
It is advisable to run ``restic check`` after pruning, to make sure
you are alerted, should the internal data structures of the repository
@@ -82,20 +81,32 @@ command must be run:
$ restic -r /srv/restic-repo prune
enter password for repository:
repository 33002c5e opened successfully, password is correct
loading all snapshots...
loading indexes...
finding data that is still in use for 4 snapshots
[0:00] 100.00% 4 / 4 snapshots
searching used packs...
collecting packs for deletion and repacking
[0:00] 100.00% 5 / 5 packs processed
to repack: 69 blobs / 1.078 MiB
this removes 67 blobs / 1.047 MiB
to delete: 7 blobs / 25.726 KiB
total prune: 74 blobs / 1.072 MiB
remaining: 16 blobs / 38.003 KiB
unused size after prune: 0 B (0.00% of remaining size)
repacking packs
[0:00] 100.00% 2 / 2 packs repacked
counting files in repo
building new index for repo
[0:00] 100.00% 22 / 22 files
repository contains 22 packs (8512 blobs) with 100.092 MiB bytes
processed 8512 blobs: 0 duplicate blobs, 0B duplicate
load all snapshots
find data that is still in use for 1 snapshots
[0:00] 100.00% 1 / 1 snapshots
found 8433 of 8512 data blobs still in use
will rewrite 3 packs
creating new index
[0:00] 86.36% 19 / 22 files
saved new index as 544a5084
[0:00] 100.00% 3 / 3 packs
finding old index files
saved new indexes as [59270b3a]
remove 4 old index files
[0:00] 100.00% 4 / 4 files deleted
removing 3 old packs
[0:00] 100.00% 3 / 3 files deleted
done
Afterwards the repository is smaller.
@@ -119,19 +130,31 @@ to ``forget``:
8c02b94b 2017-02-21 10:48:33 mopped /home/user/work
1 snapshots have been removed, running prune
counting files in repo
building new index for repo
[0:00] 100.00% 37 / 37 packs
repository contains 37 packs (5521 blobs) with 151.012 MiB bytes
processed 5521 blobs: 0 duplicate blobs, 0B duplicate
load all snapshots
find data that is still in use for 1 snapshots
loading all snapshots...
loading indexes...
finding data that is still in use for 1 snapshots
[0:00] 100.00% 1 / 1 snapshots
found 5323 of 5521 data blobs still in use, removing 198 blobs
will delete 0 packs and rewrite 27 packs, this frees 22.106 MiB
creating new index
[0:00] 100.00% 30 / 30 packs
saved new index as b49f3e68
searching used packs...
collecting packs for deletion and repacking
[0:00] 100.00% 5 / 5 packs processed
to repack: 69 blobs / 1.078 MiB
this removes 67 blobs / 1.047 MiB
to delete: 7 blobs / 25.726 KiB
total prune: 74 blobs / 1.072 MiB
remaining: 16 blobs / 38.003 KiB
unused size after prune: 0 B (0.00% of remaining size)
repacking packs
[0:00] 100.00% 2 / 2 packs repacked
counting files in repo
[0:00] 100.00% 3 / 3 packs
finding old index files
saved new indexes as [59270b3a]
remove 4 old index files
[0:00] 100.00% 4 / 4 files deleted
removing 3 old packs
[0:00] 100.00% 3 / 3 files deleted
done
Removing snapshots according to a policy
@@ -282,3 +305,44 @@ last-day-of-the-months (11 or 12 depends if the 5 weeklies cross a month).
And finally 75 last-day-of-the-year snapshots. All other snapshots are
removed.
Customize pruning
*****************
To understand the custom options, we first explain how the pruning process works:
- First all snapshots and directories within snapshots are scanned to determine
which data is still in use.
- Then, for each pack file, ``prune`` determines whether the file is fully used,
partly used or completely unused.
- Completely unused packs are marked for deletion. Fully used packs are kept.
A partially used pack is either kept or marked for repacking, depending on the
given options (a condensed sketch of this decision follows this list).
Note that for repacking, restic must download the pack file from the repository
storage and re-upload the needed data to the repository. This can be very
time-consuming for remote repositories.
- After deciding what to do, ``prune`` will actually perform the repack, modify
the index according to the changes and delete the obsolete files.
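
As a condensed sketch of this keep/repack/delete decision, here is an illustrative Go helper (not part of restic itself) that mirrors the logic of the new cmd_prune.go, using its packInfo fields; the string results are purely descriptive:

// decidePack sketches how prune classifies a pack. Packs returned as
// "repack candidate" are then sorted and repacked only as far as
// --max-unused and --max-repack-size allow.
func decidePack(p packInfo, repackCacheableOnly bool) string {
	switch {
	case p.usedBlobs == 0 && p.duplicateBlobs == 0:
		return "remove" // completely unused pack, delete it
	case repackCacheableOnly && p.tpe == restic.DataBlob:
		return "keep" // --repack-cacheable-only: leave data packs alone
	case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob:
		return "keep" // fully used, no duplicates, not a mixed pack
	default:
		return "repack candidate" // partly used, duplicated or mixed pack
	}
}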
The ``prune`` command accepts the following options:
- ``--max-unused limit`` allows unused data up to the specified limit within the repository.
This allows restic to keep partly used packs instead of repacking them.
The limit can be specified as an absolute size, e.g. "200M", or as a percentage of the
total repository size, e.g. "0.5%".
``prune`` tries to repack as little data as possible while still keeping the amount of
unused data within this limit.
If you want to minimize the space used by your repository, use a value of 0%.
If you want to minimize the time and bandwidth used by the ``prune`` command, use a
high value. A value of 100% will not require any pack file to be repacked.
The default value is 5%. See the sketch at the end of this section for how a
percentage limit is applied internally.
- ``--max-repack-size size`` if set, limits the total size of packs to repack.
As ``prune`` first stores all repacked packs and only deletes the obsolete packs
at the end, this option can be handy if you expect many packs to be repacked and
are worried about running low on storage.
- ``--repack-cacheable-only`` if set to true, only pack files which are cacheable
are repacked; other pack files are not repacked.
This allows very fast repacking using only cached data. It can, however, imply that the
unused data in your repository exceeds the limit given by ``--max-unused``.
The default value is false.
- ``--dry-run`` only show what ``prune`` would do.
- ``--verbose`` increased verbosity shows additional statistics for ``prune``.
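
To make the ``--max-unused`` semantics concrete, here is a minimal sketch (an illustrative helper, not part of restic's code) of how a limit is turned into a byte budget for unused data, mirroring the computation in the new cmd_prune.go. A percentage refers to the total repository size, so internally it is converted relative to the used size:

// maxUnusedAfterPrune returns how many bytes of unused data prune may leave behind.
// maxUnusedBytes is the absolute limit (the maximum uint64 value if only a
// percentage was given), maxUnusedPercent the percentage limit, and usedSize the
// total size of used blobs.
func maxUnusedAfterPrune(maxUnusedBytes uint64, maxUnusedPercent float64, usedSize uint64) uint64 {
	limit := maxUnusedBytes
	if maxUnusedPercent < 100.0 {
		// x% unused of the total size corresponds to x/(100-x) of the used size,
		// e.g. the default 5% allows roughly 5/95 (~5.3%) of the used size.
		byPercent := uint64(maxUnusedPercent / (100.0 - maxUnusedPercent) * float64(usedSize))
		if byPercent < limit {
			limit = byPercent
		}
	}
	return limit
}

For example, with the default limit of 5% and 100 GiB of used data, prune tolerates roughly 5.3 GiB of unused data remaining after the run.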