Count unique files by blob sequence rather than tree ID

This commit is contained in:
Matthew Holt 2018-04-21 16:33:18 -06:00 committed by Alexander Neumann
parent f7659bd8b0
commit 925b542eb0

View file

@ -2,6 +2,7 @@ package main
import ( import (
"context" "context"
"crypto/sha256"
"encoding/json" "encoding/json"
"fmt" "fmt"
"os" "os"
@ -51,7 +52,7 @@ func runStats(gopts GlobalOptions, args []string) error {
// create a container for the stats, and other state // create a container for the stats, and other state
// needed while walking the trees // needed while walking the trees
stats := &statsContainer{idSet: restic.NewIDSet()} stats := &statsContainer{uniqueFiles: make(map[fileID]struct{}), idSet: make(restic.IDSet)}
// iterate every snapshot in the repo // iterate every snapshot in the repo
err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error { err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error {
@ -96,10 +97,18 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta
} }
for _, node := range tree.Nodes { for _, node := range tree.Nodes {
// update our stats to account for this node // only count this file if we haven't visited it before
stats.TotalOriginalSize += node.Size fid := makeFileID(node)
stats.TotalCount++ if _, ok := stats.uniqueFiles[fid]; !ok {
// mark the file as visited
stats.uniqueFiles[fid] = struct{}{}
// update our stats to account for this node
stats.TotalOriginalSize += node.Size
stats.TotalCount++
}
// visit subtrees (i.e. directory contents)
if node.Subtree != nil { if node.Subtree != nil {
err = walkTree(ctx, repo, *node.Subtree, stats) err = walkTree(ctx, repo, *node.Subtree, stats)
if err != nil { if err != nil {
@ -111,6 +120,14 @@ func walkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, sta
return nil return nil
} }
// makeFileID derives an identifier for node from the ordered sequence of
// its content blob IDs: the result is the SHA-256 digest of all IDs in
// node.Content concatenated in order.
//
// Two nodes with the same blob sequence therefore get the same fileID,
// regardless of any other node attributes. Note that every node whose
// Content is empty maps to the same fileID (the digest of empty input).
// NOTE(review): this presumably includes directories and empty files, so
// they would collectively be counted once by callers that deduplicate on
// this ID — confirm that this is intended.
func makeFileID(node *restic.Node) fileID {
	// Feed the blob IDs to the hasher one at a time instead of first
	// concatenating them into a temporary buffer; the digest is identical,
	// but no allocation proportional to the number of blobs is needed.
	h := sha256.New()
	for _, c := range node.Content {
		// hash.Hash.Write is documented to never return an error.
		h.Write(c[:])
	}

	// Sum appends the digest to the slice it is given; fid[:0] has room
	// for exactly one SHA-256 digest, so the result lands in fid itself.
	var fid fileID
	h.Sum(fid[:0])
	return fid
}
// statsContainer holds information during a walk of a repository // statsContainer holds information during a walk of a repository
// to collect information about it, as well as state needed // to collect information about it, as well as state needed
// for a successful and efficient walk. // for a successful and efficient walk.
@ -118,4 +135,7 @@ type statsContainer struct {
TotalCount uint64 `json:"total_count"` TotalCount uint64 `json:"total_count"`
TotalOriginalSize uint64 `json:"total_original_size"` TotalOriginalSize uint64 `json:"total_original_size"`
idSet restic.IDSet idSet restic.IDSet
uniqueFiles map[fileID]struct{}
} }
// fileID is a fixed-size, comparable identifier for a file, suitable for
// use as a map key. In this file it holds the SHA-256 digest produced by
// makeFileID from a node's content blob IDs.
type fileID [32]byte