Merge pull request #2859 from buschjost/stats-filter-by-tag-and-path

Add filter by tag and path to stats command
commit 55071ee367
Authored by MichaelEischer on 2020-08-31 22:11:01 +02:00, committed by GitHub
4 changed files with 67 additions and 88 deletions


@@ -0,0 +1,9 @@
+Enhancement: Support filtering snapshots by tag and path in the stats command
+
+We've added filtering snapshots by `--tag tagList` and by `--path path` to
+the `stats` command. The filters work both when only the 'latest' snapshot
+is considered and when all snapshots in a repository are scanned.
+
+https://github.com/restic/restic/issues/2858
+https://github.com/restic/restic/pull/2859
+https://forum.restic.net/t/stats-for-a-host-and-filtered-snapshots/3020
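As a quick illustration of the new filters (the repository path, tag and filesystem path below are placeholders, not taken from this change):

    $ restic -r /srv/restic-repo stats --tag mytag
    $ restic -r /srv/restic-repo stats --path /home/user latest

The first command accumulates statistics over every snapshot carrying the tag; the second considers only the latest snapshot that includes the given path. Both flags may be repeated and combined with `--host`.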


@@ -6,7 +6,6 @@ import (
     "fmt"
     "path/filepath"
 
-    "github.com/restic/restic/internal/errors"
     "github.com/restic/restic/internal/restic"
     "github.com/restic/restic/internal/walker"
@@ -15,18 +14,19 @@ import (
 )
 
 var cmdStats = &cobra.Command{
-    Use:   "stats [flags] [snapshot-ID]",
+    Use:   "stats [flags] [snapshot ID] [...]",
     Short: "Scan the repository and show basic statistics",
     Long: `
-The "stats" command walks one or all snapshots in a repository and
-accumulates statistics about the data stored therein. It reports on
-the number of unique files and their sizes, according to one of
+The "stats" command walks one or multiple snapshots in a repository
+and accumulates statistics about the data stored therein. It reports
+on the number of unique files and their sizes, according to one of
 the counting modes as given by the --mode flag.
 
-If no snapshot is specified, all snapshots will be considered. Some
-modes make more sense over just a single snapshot, while others
-are useful across all snapshots, depending on what you are trying
-to calculate.
+It operates on all snapshots matching the selection criteria or all
+snapshots if nothing is specified. The special snapshot ID "latest"
+is also supported. Some modes make more sense over
+just a single snapshot, while others are useful across all snapshots,
+depending on what you are trying to calculate.
 
 The modes are:
@@ -50,11 +50,26 @@ Exit status is 0 if the command was successful, and non-zero if there was any er
     },
 }
 
+// StatsOptions collects all options for the stats command.
+type StatsOptions struct {
+    // the mode of counting to perform (see consts for available modes)
+    countMode string
+
+    // filter snapshots by, if given by user
+    Hosts []string
+    Tags  restic.TagLists
+    Paths []string
+}
+
+var statsOptions StatsOptions
+
 func init() {
     cmdRoot.AddCommand(cmdStats)
     f := cmdStats.Flags()
-    f.StringVar(&countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-contents, blobs-per-file, or raw-data")
-    f.StringArrayVarP(&snapshotByHosts, "host", "H", nil, "filter latest snapshot by this hostname (can be specified multiple times)")
+    f.StringVar(&statsOptions.countMode, "mode", countModeRestoreSize, "counting mode: restore-size (default), files-by-contents, blobs-per-file or raw-data")
+    f.StringArrayVarP(&statsOptions.Hosts, "host", "H", nil, "only consider snapshots with the given `host` (can be specified multiple times)")
+    f.Var(&statsOptions.Tags, "tag", "only consider snapshots which include this `taglist` in the format `tag[,tag,...]` (can be specified multiple times)")
+    f.StringArrayVar(&statsOptions.Paths, "path", nil, "only consider snapshots which include this (absolute) `path` (can be specified multiple times)")
 }
 
 func runStats(gopts GlobalOptions, args []string) error {
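To keep the flag wiring above concrete, here is a rough sketch of invocations these options enable; the repository path, host, tag and path values are placeholders, not part of the change:

    $ restic -r /srv/restic-repo stats --mode files-by-contents --host myserver
    $ restic -r /srv/restic-repo stats --mode blobs-per-file --tag mytag --path /home/user

Each filter narrows the set of snapshots handed to the selected counting mode, and all of them can be given more than once.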
@@ -89,52 +104,25 @@ func runStats(gopts GlobalOptions, args []string) error {
     // create a container for the stats (and other needed state)
     stats := &statsContainer{
-        uniqueFiles:  make(map[fileID]struct{}),
-        uniqueInodes: make(map[uint64]struct{}),
-        fileBlobs:    make(map[string]restic.IDSet),
-        blobs:        restic.NewBlobSet(),
+        uniqueFiles:    make(map[fileID]struct{}),
+        uniqueInodes:   make(map[uint64]struct{}),
+        fileBlobs:      make(map[string]restic.IDSet),
+        blobs:          restic.NewBlobSet(),
+        snapshotsCount: 0,
     }
 
-    if snapshotIDString != "" {
-        // scan just a single snapshot
-
-        var sID restic.ID
-        if snapshotIDString == "latest" {
-            sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHosts)
-            if err != nil {
-                return errors.Fatalf("latest snapshot for criteria not found: %v", err)
-            }
-        } else {
-            sID, err = restic.FindSnapshot(repo, snapshotIDString)
-            if err != nil {
-                return errors.Fatalf("error loading snapshot: %v", err)
-            }
-        }
-
-        snapshot, err := restic.LoadSnapshot(ctx, repo, sID)
-        if err != nil {
-            return errors.Fatalf("error loading snapshot from repo: %v", err)
-        }
-
-        err = statsWalkSnapshot(ctx, snapshot, repo, stats)
+    for sn := range FindFilteredSnapshots(ctx, repo, statsOptions.Hosts, statsOptions.Tags, statsOptions.Paths, args) {
+        err = statsWalkSnapshot(ctx, sn, repo, stats)
         if err != nil {
             return fmt.Errorf("error walking snapshot: %v", err)
         }
-    } else {
-        // iterate every snapshot in the repo
-        err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error {
-            snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID)
-            if err != nil {
-                return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err)
-            }
-            return statsWalkSnapshot(ctx, snapshot, repo, stats)
-        })
     }
     if err != nil {
         return err
     }
 
-    if countMode == countModeRawData {
+    if statsOptions.countMode == countModeRawData {
@@ -154,22 +142,16 @@ func runStats(gopts GlobalOptions, args []string) error {
         return nil
     }
 
-    // inform the user what was scanned and how it was scanned
-    snapshotsScanned := snapshotIDString
-    if snapshotsScanned == "latest" {
-        snapshotsScanned = "the latest snapshot"
-    } else if snapshotsScanned == "" {
-        snapshotsScanned = "all snapshots"
-    }
-
-    Printf("Stats for %s in %s mode:\n", snapshotsScanned, countMode)
+    Printf("Stats in %s mode:\n", statsOptions.countMode)
+    Printf("Snapshots processed:  %d\n", stats.snapshotsCount)
+
     if stats.TotalBlobCount > 0 {
         Printf("  Total Blob Count:   %d\n", stats.TotalBlobCount)
     }
     if stats.TotalFileCount > 0 {
         Printf("  Total File Count:   %d\n", stats.TotalFileCount)
     }
     Printf("  Total Size:   %-5s\n", formatBytes(stats.TotalSize))
 
     return nil
 }
@@ -179,7 +161,9 @@ func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo rest
         return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str())
     }
 
-    if countMode == countModeRawData {
+    stats.snapshotsCount++
+
+    if statsOptions.countMode == countModeRawData {
         // count just the sizes of unique blobs; we don't need to walk the tree
         // ourselves in this case, since a nifty function does it for us
         return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs)
@@ -189,6 +173,7 @@ func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo rest
     if err != nil {
         return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err)
     }
+
     return nil
 }
@@ -201,19 +186,19 @@ func statsWalkTree(repo restic.Repository, stats *statsContainer) walker.WalkFun
             return true, nil
         }
 
-        if countMode == countModeUniqueFilesByContents || countMode == countModeBlobsPerFile {
+        if statsOptions.countMode == countModeUniqueFilesByContents || statsOptions.countMode == countModeBlobsPerFile {
             // only count this file if we haven't visited it before
             fid := makeFileIDByContents(node)
             if _, ok := stats.uniqueFiles[fid]; !ok {
                 // mark the file as visited
                 stats.uniqueFiles[fid] = struct{}{}
 
-                if countMode == countModeUniqueFilesByContents {
+                if statsOptions.countMode == countModeUniqueFilesByContents {
                     // simply count the size of each unique file (unique by contents only)
                     stats.TotalSize += node.Size
                     stats.TotalFileCount++
                 }
-                if countMode == countModeBlobsPerFile {
+                if statsOptions.countMode == countModeBlobsPerFile {
                     // count the size of each unique blob reference, which is
                     // by unique file (unique by contents and file path)
                     for _, blobID := range node.Content {
@@ -243,7 +228,7 @@ func statsWalkTree(repo restic.Repository, stats *statsContainer) walker.WalkFun
             }
         }
 
-        if countMode == countModeRestoreSize {
+        if statsOptions.countMode == countModeRestoreSize {
             // as this is a file in the snapshot, we can simply count its
             // size without worrying about uniqueness, since duplicate files
             // will still be restored
@@ -275,23 +260,13 @@ func makeFileIDByContents(node *restic.Node) fileID {
 func verifyStatsInput(gopts GlobalOptions, args []string) error {
     // require a recognized counting mode
-    switch countMode {
+    switch statsOptions.countMode {
     case countModeRestoreSize:
     case countModeUniqueFilesByContents:
     case countModeBlobsPerFile:
     case countModeRawData:
     default:
-        return fmt.Errorf("unknown counting mode: %s (use the -h flag to get a list of supported modes)", countMode)
+        return fmt.Errorf("unknown counting mode: %s (use the -h flag to get a list of supported modes)", statsOptions.countMode)
     }
 
-    // ensure at most one snapshot was specified
-    if len(args) > 1 {
-        return fmt.Errorf("only one snapshot may be specified")
-    }
-
-    // if a snapshot was specified, mark it as the one to scan
-    if len(args) == 1 {
-        snapshotIDString = args[0]
-    }
-
     return nil
@@ -320,23 +295,14 @@ type statsContainer struct {
     // blobs is used to count individual unique blobs,
     // independent of references to files
     blobs restic.BlobSet
+
+    // holds count of all considered snapshots
+    snapshotsCount int
 }
 
 // fileID is a 256-bit hash that distinguishes unique files.
 type fileID [32]byte
 
-var (
-    // the mode of counting to perform
-    countMode string
-
-    // the snapshot to scan, as given by the user
-    snapshotIDString string
-
-    // snapshotByHost is the host to filter latest
-    // snapshot by, if given by user
-    snapshotByHosts []string
-)
-
 const (
     countModeRestoreSize           = "restore-size"
     countModeUniqueFilesByContents = "files-by-contents"


@@ -22,10 +22,10 @@ func FindFilteredSnapshots(ctx context.Context, repo *repository.Repository, hos
             // Process all snapshot IDs given as arguments.
             for _, s := range snapshotIDs {
                 if s == "latest" {
+                    usedFilter = true
                     id, err = restic.FindLatestSnapshot(ctx, repo, paths, tags, hosts)
                     if err != nil {
                         Warnf("Ignoring %q, no snapshot matched given filter (Paths:%v Tags:%v Hosts:%v)\n", s, paths, tags, hosts)
-                        usedFilter = true
                         continue
                     }
                 } else {


@@ -306,6 +306,10 @@ host by using the ``--host`` flag:
 There we see that it would take 482 GiB of disk space to restore the latest
 snapshot from "myserver".
 
+In case you have multiple backups running from the same host you can also use
+``--tag`` and ``--path`` to be more specific about which snapshots you
+are looking for.
+
 But how much space does that snapshot take on disk? In other words, how much
 has restic's deduplication helped? We can check:
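The manual's example command and output are truncated in this view; purely as an illustrative sketch of that check combined with the newly documented filters (the repository path, host and tag are placeholders):

    $ restic -r /srv/restic-repo stats --mode raw-data --host myserver --tag home latest

Here raw-data mode reports the deduplicated on-disk size for the latest snapshot from that host carrying the given tag.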