prune: Add unsafe option to recover from no free space

The new option allows prune to operate with nearly no scratch space by only removing
no longer necessary pack files and first deleting the index before
rebuilding it. By first deleting the index it becomes safe to just
delete no longer necessary pack files. However, as a downside there's
now the risk that the repository becomes inaccessible if prune fails.

To recover from that problem a user might have to manually delete the
repository index and then run (a full) `rebuild-index` again.
This commit is contained in:
Michael Eischer 2021-08-16 16:02:01 +02:00
parent cf5cb673fb
commit dbbeac7174
4 changed files with 92 additions and 11 deletions

View file

@ -0,0 +1,9 @@
Enhancement: Support pruning even after running out of disk space
When running out of disk space it was no longer possible to add or remove
data from a repository. To help with recovering from such a deadlock, the
prune command now supports an `--unsafe-recover-no-free-space` option to
recover from such situations. Make sure to read the documentation first!
https://github.com/restic/restic/issues/1153
https://github.com/restic/restic/pull/3481

View file

@ -39,7 +39,10 @@ Exit status is 0 if the command was successful, and non-zero if there was any er
// PruneOptions collects all options for the cleanup command.
type PruneOptions struct {
DryRun bool
DryRun bool
UnsafeNoSpaceRecovery string
unsafeRecovery bool
MaxUnused string
maxUnusedBytes func(used uint64) (unused uint64) // calculates the number of unused bytes after repacking, according to MaxUnused
@ -56,6 +59,7 @@ func init() {
cmdRoot.AddCommand(cmdPrune)
f := cmdPrune.Flags()
f.BoolVarP(&pruneOptions.DryRun, "dry-run", "n", false, "do not modify the repository, just print what would be done")
f.StringVarP(&pruneOptions.UnsafeNoSpaceRecovery, "unsafe-recover-no-free-space", "", "", "UNSAFE, READ THE DOCUMENTATION BEFORE USING! Try to recover a repository stuck with no free space. Do not use without trying out 'prune --max-repack-size 0' first.")
addPruneOptions(cmdPrune)
}
@ -75,6 +79,10 @@ func verifyPruneOptions(opts *PruneOptions) error {
}
opts.MaxRepackBytes = uint64(size)
}
if opts.UnsafeNoSpaceRecovery != "" {
// prevent repacking data to make sure users cannot get stuck.
opts.MaxRepackBytes = 0
}
maxUnused := strings.TrimSpace(opts.MaxUnused)
if maxUnused == "" {
@ -136,6 +144,14 @@ func runPrune(opts PruneOptions, gopts GlobalOptions) error {
return errors.Fatal("prune requires a backend connection limit of at least two")
}
if opts.UnsafeNoSpaceRecovery != "" {
repoID := repo.Config().ID
if opts.UnsafeNoSpaceRecovery != repoID {
return errors.Fatalf("must pass id '%s' to --unsafe-recover-no-free-space", repoID)
}
opts.unsafeRecovery = true
}
lock, err := lockRepoExclusive(gopts.ctx, repo)
defer unlockRepo(lock)
if err != nil {
@ -522,7 +538,14 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
ignorePacks.Merge(removePacks)
}
if len(ignorePacks) != 0 {
if opts.unsafeRecovery {
Verbosef("deleting index files\n")
indexFiles := repo.Index().(*repository.MasterIndex).IDs()
err = DeleteFilesChecked(gopts, repo, indexFiles, restic.IndexFile)
if err != nil {
return errors.Fatalf("%s", err)
}
} else if len(ignorePacks) != 0 {
err = rebuildIndexFiles(gopts, repo, ignorePacks, nil)
if err != nil {
return errors.Fatalf("%s", err)
@ -534,11 +557,18 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
DeleteFiles(gopts, repo, removePacks, restic.PackFile)
}
if opts.unsafeRecovery {
_, err = writeIndexFiles(gopts, repo, ignorePacks, nil)
if err != nil {
return errors.Fatalf("%s", err)
}
}
Verbosef("done\n")
return nil
}
func rebuildIndexFiles(gopts GlobalOptions, repo restic.Repository, removePacks restic.IDSet, extraObsolete restic.IDs) error {
func writeIndexFiles(gopts GlobalOptions, repo restic.Repository, removePacks restic.IDSet, extraObsolete restic.IDs) (restic.IDSet, error) {
Verbosef("rebuilding index\n")
idx := (repo.Index()).(*repository.MasterIndex)
@ -546,6 +576,11 @@ func rebuildIndexFiles(gopts GlobalOptions, repo restic.Repository, removePacks
bar := newProgressMax(!gopts.Quiet, packcount, "packs processed")
obsoleteIndexes, err := idx.Save(gopts.ctx, repo, removePacks, extraObsolete, bar)
bar.Done()
return obsoleteIndexes, err
}
func rebuildIndexFiles(gopts GlobalOptions, repo restic.Repository, removePacks restic.IDSet, extraObsolete restic.IDs) error {
obsoleteIndexes, err := writeIndexFiles(gopts, repo, removePacks, extraObsolete)
if err != nil {
return err
}

View file

@ -1573,26 +1573,35 @@ func TestCheckRestoreNoLock(t *testing.T) {
}
func TestPrune(t *testing.T) {
t.Run("0", func(t *testing.T) {
opts := PruneOptions{MaxUnused: "0%"}
testPruneVariants(t, false)
testPruneVariants(t, true)
}
func testPruneVariants(t *testing.T, unsafeNoSpaceRecovery bool) {
suffix := ""
if unsafeNoSpaceRecovery {
suffix = "-recovery"
}
t.Run("0"+suffix, func(t *testing.T) {
opts := PruneOptions{MaxUnused: "0%", unsafeRecovery: unsafeNoSpaceRecovery}
checkOpts := CheckOptions{ReadData: true, CheckUnused: true}
testPrune(t, opts, checkOpts)
})
t.Run("50", func(t *testing.T) {
opts := PruneOptions{MaxUnused: "50%"}
t.Run("50"+suffix, func(t *testing.T) {
opts := PruneOptions{MaxUnused: "50%", unsafeRecovery: unsafeNoSpaceRecovery}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
t.Run("unlimited", func(t *testing.T) {
opts := PruneOptions{MaxUnused: "unlimited"}
t.Run("unlimited"+suffix, func(t *testing.T) {
opts := PruneOptions{MaxUnused: "unlimited", unsafeRecovery: unsafeNoSpaceRecovery}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})
t.Run("CachableOnly", func(t *testing.T) {
opts := PruneOptions{MaxUnused: "5%", RepackCachableOnly: true}
t.Run("CachableOnly"+suffix, func(t *testing.T) {
opts := PruneOptions{MaxUnused: "5%", RepackCachableOnly: true, unsafeRecovery: unsafeNoSpaceRecovery}
checkOpts := CheckOptions{ReadData: true}
testPrune(t, opts, checkOpts)
})

View file

@ -444,3 +444,31 @@ The ``prune`` command accepts the following options:
- ``--dry-run`` only show what ``prune`` would do.
- ``--verbose`` increased verbosity shows additional statistics for ``prune``.
Recovering from "no free space" errors
**************************************
In some cases when a repository has grown large enough to fill up all disk space or the
allocated quota, then ``prune`` might fail to free space. ``prune`` works in such a way
that a repository remains usable no matter at which point the command is interrupted.
However, this also means that ``prune`` requires some scratch space to work.
In most cases it is sufficient to instruct ``prune`` to use as little scratch space as
possible by running it as ``prune --max-repack-size 0``. Note that for restic versions
before 0.13.0 ``prune --max-repack-size 1`` must be used. Obviously, this can only work
if several snapshots have been removed using ``forget`` before. This then allows the
``prune`` command to actually remove data from the repository. If the command succeeds,
but there is still little free space, then remove a few more snapshots and run ``prune`` again.
If ``prune`` fails to complete, then ``prune --unsafe-recover-no-free-space SOME-ID``
is available as a method of last resort. It allows prune to work with little to no free
space. However, a **failed** ``prune`` run can cause the repository to become
**temporarily unusable**. Therefore, make sure that you have a stable connection to the
repository storage, before running this command. In case the command fails, it may become
necessary to manually remove all files from the `index/` folder of the repository and
run `rebuild-index` afterwards.
To prevent accidental usages of the ``--unsafe-recover-no-free-space`` option it is
necessary to first run ``prune --unsafe-recover-no-free-space SOME-ID`` and then replace
``SOME-ID`` with the requested ID.