Merge pull request #1729 from mholt/stats

Implement `restic stats` command to get more info about a repository
This commit is contained in:
Alexander Neumann 2018-07-31 23:24:36 +02:00
commit 3422c1ca83
4 changed files with 390 additions and 0 deletions

View file

@ -0,0 +1,4 @@
Enhancement: Add stats command to get information about a repository
https://github.com/restic/restic/issues/874
https://github.com/restic/restic/pull/1729

314
cmd/restic/cmd_stats.go Normal file
View file

@ -0,0 +1,314 @@
package main
import (
"context"
"crypto/sha256"
"encoding/json"
"fmt"
"os"
"path/filepath"
"github.com/restic/restic/internal/restic"
"github.com/restic/restic/internal/walker"
"github.com/spf13/cobra"
)
var cmdStats = &cobra.Command{
Use: "stats [flags] [snapshot-ID]",
Short: "Scan the repository and show basic statistics",
Long: `
The "stats" command walks one or all snapshots in a repository and
accumulates statistics about the data stored therein. It reports on
the number of unique files and their sizes, according to one of
the counting modes as given by the --mode flag.
If no snapshot is specified, all snapshots will be considered. Some
modes make more sense over just a single snapshot, while others
are useful across all snapshots, depending on what you are trying
to calculate.
The modes are:
restore-size: (default) Counts the size of the restored files.
files-by-contents: Counts total size of files, where a file is
considered unique if it has unique contents.
raw-data: Counts the size of blobs in the repository, regardless
of how many files reference them.
blobs-per-file: A combination of files-by-contents and raw-data.
Refer to the online manual for more details about each mode.
`,
DisableAutoGenTag: true,
RunE: func(cmd *cobra.Command, args []string) error {
return runStats(globalOptions, args)
},
}
func init() {
cmdRoot.AddCommand(cmdStats)
f := cmdStats.Flags()
f.StringVar(&countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-contents, blobs-per-file, or raw-data")
f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname")
}
func runStats(gopts GlobalOptions, args []string) error {
err := verifyStatsInput(gopts, args)
if err != nil {
return err
}
ctx, cancel := context.WithCancel(gopts.ctx)
defer cancel()
repo, err := OpenRepository(gopts)
if err != nil {
return err
}
if err = repo.LoadIndex(ctx); err != nil {
return err
}
if !gopts.NoLock {
lock, err := lockRepo(repo)
defer unlockRepo(lock)
if err != nil {
return err
}
}
// create a container for the stats (and other needed state)
stats := &statsContainer{
uniqueFiles: make(map[fileID]struct{}),
fileBlobs: make(map[string]restic.IDSet),
blobs: restic.NewBlobSet(),
blobsSeen: restic.NewBlobSet(),
}
if snapshotIDString != "" {
// scan just a single snapshot
var sID restic.ID
if snapshotIDString == "latest" {
sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHost)
if err != nil {
Exitf(1, "latest snapshot for criteria not found: %v", err)
}
} else {
sID, err = restic.FindSnapshot(repo, snapshotIDString)
if err != nil {
return err
}
}
snapshot, err := restic.LoadSnapshot(ctx, repo, sID)
if err != nil {
return err
}
err = statsWalkSnapshot(ctx, snapshot, repo, stats)
} else {
// iterate every snapshot in the repo
err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error {
snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID)
if err != nil {
return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err)
}
return statsWalkSnapshot(ctx, snapshot, repo, stats)
})
}
if err != nil {
return err
}
if countMode == countModeRawData {
// the blob handles have been collected, but not yet counted
for blobHandle := range stats.blobs {
blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type)
if !found {
return fmt.Errorf("blob %v not found", blobHandle)
}
stats.TotalSize += uint64(blobSize)
stats.TotalBlobCount++
}
}
if gopts.JSON {
err = json.NewEncoder(os.Stdout).Encode(stats)
if err != nil {
return fmt.Errorf("encoding output: %v", err)
}
return nil
}
if stats.TotalBlobCount > 0 {
Printf(" Total Blob Count: %d\n", stats.TotalBlobCount)
}
if stats.TotalFileCount > 0 {
Printf(" Total File Count: %d\n", stats.TotalFileCount)
}
Printf(" Total Size: %-5s\n", formatBytes(stats.TotalSize))
return nil
}
func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo restic.Repository, stats *statsContainer) error {
if snapshot.Tree == nil {
return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
}
if countMode == countModeRawData {
// count just the sizes of unique blobs; we don't need to walk the tree
// ourselves in this case, since a nifty function does it for us
return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen)
}
err := walker.Walk(ctx, repo, *snapshot.Tree, restic.NewIDSet(), statsWalkTree(repo, stats))
if err != nil {
return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err)
}
return nil
}
func statsWalkTree(repo restic.Repository, stats *statsContainer) walker.WalkFunc {
return func(npath string, node *restic.Node, nodeErr error) (bool, error) {
if nodeErr != nil {
return true, nodeErr
}
if node == nil {
return true, nil
}
if countMode == countModeUniqueFilesByContents || countMode == countModeBlobsPerFile {
// only count this file if we haven't visited it before
fid := makeFileIDByContents(node)
if _, ok := stats.uniqueFiles[fid]; !ok {
// mark the file as visited
stats.uniqueFiles[fid] = struct{}{}
if countMode == countModeUniqueFilesByContents {
// simply count the size of each unique file (unique by contents only)
stats.TotalSize += node.Size
stats.TotalFileCount++
}
if countMode == countModeBlobsPerFile {
// count the size of each unique blob reference, which is
// by unique file (unique by contents and file path)
for _, blobID := range node.Content {
// ensure we have this file (by path) in our map; in this
// mode, a file is unique by both contents and path
nodePath := filepath.Join(npath, node.Name)
if _, ok := stats.fileBlobs[nodePath]; !ok {
stats.fileBlobs[nodePath] = restic.NewIDSet()
stats.TotalFileCount++
}
if _, ok := stats.fileBlobs[nodePath][blobID]; !ok {
// is always a data blob since we're accessing it via a file's Content array
blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob)
if !found {
return true, fmt.Errorf("blob %s not found for tree %s", blobID, *node.Subtree)
}
// count the blob's size, then add this blob by this
// file (path) so we don't double-count it
stats.TotalSize += uint64(blobSize)
stats.fileBlobs[nodePath].Insert(blobID)
// this mode also counts total unique blob _references_ per file
stats.TotalBlobCount++
}
}
}
}
}
if countMode == countModeRestoreSize {
// as this is a file in the snapshot, we can simply count its
// size without worrying about uniqueness, since duplicate files
// will still be restored
stats.TotalSize += node.Size
stats.TotalFileCount++
}
return true, nil
}
}
// makeFileIDByContents returns a hash of the blob IDs of the
// node's Content in sequence.
func makeFileIDByContents(node *restic.Node) fileID {
var bb []byte
for _, c := range node.Content {
bb = append(bb, []byte(c[:])...)
}
return sha256.Sum256(bb)
}
func verifyStatsInput(gopts GlobalOptions, args []string) error {
// require a recognized counting mode
switch countMode {
case countModeRestoreSize:
case countModeUniqueFilesByContents:
case countModeBlobsPerFile:
case countModeRawData:
default:
return fmt.Errorf("unknown counting mode: %s (use the -h flag to get a list of supported modes)", countMode)
}
// ensure at most one snapshot was specified
if len(args) > 1 {
return fmt.Errorf("only one snapshot may be specified")
}
// if a snapshot was specified, mark it as the one to scan
if len(args) == 1 {
snapshotIDString = args[0]
}
return nil
}
// statsContainer holds information during a walk of a repository
// to collect information about it, as well as state needed
// for a successful and efficient walk.
type statsContainer struct {
TotalSize uint64 `json:"total_size"`
TotalFileCount uint64 `json:"total_file_count"`
TotalBlobCount uint64 `json:"total_blob_count,omitempty"`
// uniqueFiles marks visited files according to their
// contents (hashed sequence of content blob IDs)
uniqueFiles map[fileID]struct{}
// fileBlobs maps a file name (path) to the set of
// blobs that have been seen as a part of the file
fileBlobs map[string]restic.IDSet
// blobs and blobsSeen are used to count indiviudal
// unique blobs, independent of references to files
blobs, blobsSeen restic.BlobSet
}
// fileID is a 256-bit hash that distinguishes unique files.
type fileID [32]byte
var (
// the mode of counting to perform
countMode string
// the snapshot to scan, as given by the user
snapshotIDString string
// snapshotByHost is the host to filter latest
// snapshot by, if given by user
snapshotByHost string
)
const (
countModeRestoreSize = "restore-size"
countModeUniqueFilesByContents = "files-by-contents"
countModeBlobsPerFile = "blobs-per-file"
countModeRawData = "raw-data"
)

View file

@ -1310,6 +1310,7 @@ _restic_root_command()
commands+=("rebuild-index")
commands+=("restore")
commands+=("snapshots")
commands+=("stats")
commands+=("tag")
commands+=("unlock")
commands+=("version")

View file

@ -36,6 +36,7 @@ Usage help is available:
rebuild-index Build a new index file
restore Extract the data from a snapshot
snapshots List all snapshots
stats Count up sizes and show information about repository data
tag Modify tags on snapshots
unlock Remove locks other processes created
version Print version information
@ -236,6 +237,76 @@ The following metadata is handled by restic:
- Subtree
- ExtendedAttributes
Getting information about repository data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Use the ``stats`` command to count up stats about the data in the repository.
There are different counting modes available using the ``--mode`` flag,
depending on what you want to calculate. The default is the restore size, or
the size required to restore the files:
- ``restore-size`` (default) counts the size of the restored files.
- ``files-by-contents`` counts the total size of unique files as given by their
contents. This can be useful since a file is considered unique only if it has
unique contents. Keep in mind that a small change to a large file (even when the
file name/path hasn't changed) will cause them to look like different files, thus
essentially causing the whole size of the file to be counted twice.
- ``raw-data`` counts the size of the blobs in the repository, regardless of how many
files reference them. This tells you how much restic has reduced all your original
data down to (either for a single snapshot or across all your backups), and compared
to the size given by the restore-size mode, can tell you how much deduplication is
helping you.
- ``blobs-per-file`` is kind of a mix between files-by-contents and raw-data modes;
it is useful for knowing how much value your backup is providing you in terms of unique
data stored by file. Like files-by-contents, it is resilient to file renames/moves.
Unlike files-by-contents, it does not balloon to high values when large files have
small edits, as long as the file path stayed the same. Unlike raw-data, this mode
DOES consider how many files point to each blob such that the more files a blob is
referenced by, the more it counts toward the size.
For example, to calculate how much space would be
required to restore the latest snapshot (from any host that made it):
.. code-block:: console
$ restic stats latest
password is correct
Total File Count: 10538
Total Size: 37.824 GiB
If multiple hosts are backing up to the repository, the latest snapshot may not
be the one you want. You can specify the latest snapshot from only a specific
host by using the ``--host`` flag:
.. code-block:: console
$ restic stats --host myserver latest
password is correct
Total File Count: 21766
Total Size: 481.783 GiB
There we see that it would take 482 GiB of disk space to restore the latest
snapshot from "myserver".
But how much space does that snapshot take on disk? In other words, how much
has restic's deduplication helped? We can check:
.. code-block:: console
$ restic stats --host myserver --mode raw-data latest
password is correct
Total Blob Count: 340847
Total Size: 458.663 GiB
Comparing this size to the previous command, we see that restic has saved
about 23 GiB of space with deduplication.
Which mode you use depends on your exact use case. Some modes are more useful
across all snapshots, while others make more sense on just a single snapshot,
depending on what you're trying to calculate.
Scripting
---------