Check data subset: check random percentage subset
This commit is contained in:
parent
3c0c0c132b
commit
8f9cea8cc0
4 changed files with 257 additions and 35 deletions
8
changelog/unreleased/pull-3038
Normal file
8
changelog/unreleased/pull-3038
Normal file
|
@ -0,0 +1,8 @@
|
|||
Enhancement: Allow specifying percentage in check's --read-data-subset
|
||||
|
||||
We've enhanced the command-line option --read-data-subset to also accept a
|
||||
percentage. This will check the given percentage of pack files which are
|
||||
randomly selected on each run.
|
||||
|
||||
https://github.com/restic/restic/pull/3038
|
||||
https://github.com/restic/restic/issues/2186
|
|
@ -1,8 +1,8 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"math/rand"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
|
@ -53,25 +53,38 @@ func init() {
|
|||
|
||||
f := cmdCheck.Flags()
|
||||
f.BoolVar(&checkOptions.ReadData, "read-data", false, "read all data blobs")
|
||||
f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read subset n of m data packs (format: `n/m`)")
|
||||
f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read a `subset` of data packs, specified as 'n/t' for specific subset or either 'x%' or 'x.y%' for random subset")
|
||||
f.BoolVar(&checkOptions.CheckUnused, "check-unused", false, "find unused blobs")
|
||||
f.BoolVar(&checkOptions.WithCache, "with-cache", false, "use the cache")
|
||||
}
|
||||
|
||||
func checkFlags(opts CheckOptions) error {
|
||||
if opts.ReadData && opts.ReadDataSubset != "" {
|
||||
return errors.Fatalf("check flags --read-data and --read-data-subset cannot be used together")
|
||||
return errors.Fatal("check flags --read-data and --read-data-subset cannot be used together")
|
||||
}
|
||||
if opts.ReadDataSubset != "" {
|
||||
dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
|
||||
if err != nil || len(dataSubset) != 2 {
|
||||
return errors.Fatalf("check flag --read-data-subset must have two positive integer values, e.g. --read-data-subset=1/2")
|
||||
}
|
||||
if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] {
|
||||
return errors.Fatalf("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2")
|
||||
}
|
||||
if dataSubset[1] > totalBucketsMax {
|
||||
return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax)
|
||||
argumentError := errors.Fatal("check flag --read-data-subset must have two positive integer values or a percentage, e.g. --read-data-subset=1/2 or --read-data-subset=2.5%%")
|
||||
if err == nil {
|
||||
if len(dataSubset) != 2 {
|
||||
return argumentError
|
||||
}
|
||||
if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] {
|
||||
return errors.Fatal("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2")
|
||||
}
|
||||
if dataSubset[1] > totalBucketsMax {
|
||||
return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax)
|
||||
}
|
||||
} else {
|
||||
percentage, err := parsePercentage(opts.ReadDataSubset)
|
||||
if err != nil {
|
||||
return argumentError
|
||||
}
|
||||
|
||||
if percentage <= 0.0 || percentage > 100.0 {
|
||||
return errors.Fatal(
|
||||
"check flag --read-data-subset=n% n must be above 0.0% and 100.0%")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -98,6 +111,21 @@ func stringToIntSlice(param string) (split []uint, err error) {
|
|||
return result, nil
|
||||
}
|
||||
|
||||
// ParsePercentage parses a percentage string of the form "X%" where X is a float constant,
|
||||
// and returns the value of that constant. It does not check the range of the value.
|
||||
func parsePercentage(s string) (float64, error) {
|
||||
if !strings.HasSuffix(s, "%") {
|
||||
return 0, errors.Errorf(`parsePercentage: %q does not end in "%%"`, s)
|
||||
}
|
||||
s = s[:len(s)-1]
|
||||
|
||||
p, err := strconv.ParseFloat(s, 64)
|
||||
if err != nil {
|
||||
return 0, errors.Errorf("parsePercentage: %v", err)
|
||||
}
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// prepareCheckCache configures a special cache directory for check.
|
||||
//
|
||||
// * if --with-cache is specified, the default cache is used
|
||||
|
@ -233,23 +261,9 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
|
|||
}
|
||||
}
|
||||
|
||||
doReadData := func(bucket, totalBuckets uint) {
|
||||
packs := make(map[restic.ID]int64)
|
||||
for pack, size := range chkr.GetPacks() {
|
||||
// If we ever check more than the first byte
|
||||
// of pack, update totalBucketsMax.
|
||||
if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
|
||||
packs[pack] = size
|
||||
}
|
||||
}
|
||||
doReadData := func(packs map[restic.ID]int64) {
|
||||
packCount := uint64(len(packs))
|
||||
|
||||
if packCount < chkr.CountPacks() {
|
||||
Verbosef(fmt.Sprintf("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets))
|
||||
} else {
|
||||
Verbosef("read all data\n")
|
||||
}
|
||||
|
||||
p := newProgressMax(!gopts.Quiet, packCount, "packs")
|
||||
errChan := make(chan error)
|
||||
|
||||
|
@ -264,10 +278,26 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
|
|||
|
||||
switch {
|
||||
case opts.ReadData:
|
||||
doReadData(1, 1)
|
||||
Verbosef("read all data\n")
|
||||
doReadData(selectPacksByBucket(chkr.GetPacks(), 1, 1))
|
||||
case opts.ReadDataSubset != "":
|
||||
dataSubset, _ := stringToIntSlice(opts.ReadDataSubset)
|
||||
doReadData(dataSubset[0], dataSubset[1])
|
||||
var packs map[restic.ID]int64
|
||||
dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
|
||||
if err == nil {
|
||||
bucket := dataSubset[0]
|
||||
totalBuckets := dataSubset[1]
|
||||
packs = selectPacksByBucket(chkr.GetPacks(), bucket, totalBuckets)
|
||||
packCount := uint64(len(packs))
|
||||
Verbosef("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets)
|
||||
} else {
|
||||
percentage, _ := parsePercentage(opts.ReadDataSubset)
|
||||
packs = selectRandomPacksByPercentage(chkr.GetPacks(), percentage)
|
||||
Verbosef("read %.1f%% of data packs\n", percentage)
|
||||
}
|
||||
if packs == nil {
|
||||
return errors.Fatal("internal error: failed to select packs to check")
|
||||
}
|
||||
doReadData(packs)
|
||||
}
|
||||
|
||||
if errorsFound {
|
||||
|
@ -278,3 +308,40 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
|
|||
|
||||
return nil
|
||||
}
|
||||
|
||||
// selectPacksByBucket selects subsets of packs by ranges of buckets.
|
||||
func selectPacksByBucket(allPacks map[restic.ID]int64, bucket, totalBuckets uint) map[restic.ID]int64 {
|
||||
packs := make(map[restic.ID]int64)
|
||||
for pack, size := range allPacks {
|
||||
// If we ever check more than the first byte
|
||||
// of pack, update totalBucketsMax.
|
||||
if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
|
||||
packs[pack] = size
|
||||
}
|
||||
}
|
||||
return packs
|
||||
}
|
||||
|
||||
// selectRandomPacksByPercentage selects the given percentage of packs which are randomly choosen.
|
||||
func selectRandomPacksByPercentage(allPacks map[restic.ID]int64, percentage float64) map[restic.ID]int64 {
|
||||
packCount := len(allPacks)
|
||||
packsToCheck := int(float64(packCount) * (percentage / 100.0))
|
||||
if packsToCheck < 1 {
|
||||
packsToCheck = 1
|
||||
}
|
||||
idx := rand.Perm(packCount)
|
||||
|
||||
var keys []restic.ID
|
||||
for k := range allPacks {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
|
||||
packs := make(map[restic.ID]int64)
|
||||
|
||||
for i := 0; i < packsToCheck; i++ {
|
||||
id := keys[idx[i]]
|
||||
packs[id] = allPacks[id]
|
||||
}
|
||||
|
||||
return packs
|
||||
}
|
||||
|
|
124
cmd/restic/cmd_check_test.go
Normal file
124
cmd/restic/cmd_check_test.go
Normal file
|
@ -0,0 +1,124 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/restic/restic/internal/restic"
|
||||
rtest "github.com/restic/restic/internal/test"
|
||||
)
|
||||
|
||||
func TestParsePercentage(t *testing.T) {
|
||||
testCases := []struct {
|
||||
input string
|
||||
output float64
|
||||
expectError bool
|
||||
}{
|
||||
{"0%", 0.0, false},
|
||||
{"1%", 1.0, false},
|
||||
{"100%", 100.0, false},
|
||||
{"123%", 123.0, false},
|
||||
{"123.456%", 123.456, false},
|
||||
{"0.742%", 0.742, false},
|
||||
{"-100%", -100.0, false},
|
||||
{" 1%", 0.0, true},
|
||||
{"1 %", 0.0, true},
|
||||
{"1% ", 0.0, true},
|
||||
}
|
||||
for _, testCase := range testCases {
|
||||
output, err := parsePercentage(testCase.input)
|
||||
|
||||
if testCase.expectError {
|
||||
rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
|
||||
rtest.Assert(t, output == 0.0, "Expected output to be 0.0, got %s", output)
|
||||
} else {
|
||||
rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
|
||||
rtest.Assert(t, math.Abs(testCase.output-output) < 0.00001, "Expected %f, got %f",
|
||||
testCase.output, output)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestStringToIntSlice(t *testing.T) {
|
||||
testCases := []struct {
|
||||
input string
|
||||
output []uint
|
||||
expectError bool
|
||||
}{
|
||||
{"3/5", []uint{3, 5}, false},
|
||||
{"1/100", []uint{1, 100}, false},
|
||||
{"abc", nil, true},
|
||||
{"1/a", nil, true},
|
||||
{"/", nil, true},
|
||||
}
|
||||
for _, testCase := range testCases {
|
||||
output, err := stringToIntSlice(testCase.input)
|
||||
|
||||
if testCase.expectError {
|
||||
rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
|
||||
rtest.Assert(t, output == nil, "Expected output to be nil, got %s", output)
|
||||
} else {
|
||||
rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
|
||||
rtest.Assert(t, len(output) == 2, "Invalid output length for case %s", testCase.input)
|
||||
rtest.Assert(t, reflect.DeepEqual(output, testCase.output), "Expected %f, got %f",
|
||||
testCase.output, output)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSelectPacksByBucket(t *testing.T) {
|
||||
var testPacks = make(map[restic.ID]int64)
|
||||
for i := 1; i <= 10; i++ {
|
||||
id := restic.NewRandomID()
|
||||
// ensure relevant part of generated id is reproducable
|
||||
id[0] = byte(i)
|
||||
testPacks[id] = 0
|
||||
}
|
||||
|
||||
selectedPacks := selectPacksByBucket(testPacks, 0, 10)
|
||||
rtest.Assert(t, len(selectedPacks) == 0, "Expected 0 selected packs")
|
||||
|
||||
for i := uint(1); i <= 5; i++ {
|
||||
selectedPacks = selectPacksByBucket(testPacks, i, 5)
|
||||
rtest.Assert(t, len(selectedPacks) == 2, "Expected 2 selected packs")
|
||||
}
|
||||
|
||||
selectedPacks = selectPacksByBucket(testPacks, 1, 1)
|
||||
rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
|
||||
for testPack := range testPacks {
|
||||
_, ok := selectedPacks[testPack]
|
||||
rtest.Assert(t, ok, "Expected input and output to be equal")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSelectRandomPacksByPercentage(t *testing.T) {
|
||||
var testPacks = make(map[restic.ID]int64)
|
||||
for i := 1; i <= 10; i++ {
|
||||
testPacks[restic.NewRandomID()] = 0
|
||||
}
|
||||
|
||||
selectedPacks := selectRandomPacksByPercentage(testPacks, 0.0)
|
||||
rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected packs")
|
||||
|
||||
selectedPacks = selectRandomPacksByPercentage(testPacks, 10.0)
|
||||
rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected pack")
|
||||
for pack := range selectedPacks {
|
||||
_, ok := testPacks[pack]
|
||||
rtest.Assert(t, ok, "Unexpected selection")
|
||||
}
|
||||
|
||||
selectedPacks = selectRandomPacksByPercentage(testPacks, 50.0)
|
||||
rtest.Assert(t, len(selectedPacks) == 5, "Expected 5 selected packs")
|
||||
for pack := range selectedPacks {
|
||||
_, ok := testPacks[pack]
|
||||
rtest.Assert(t, ok, "Unexpected item in selection")
|
||||
}
|
||||
|
||||
selectedPacks = selectRandomPacksByPercentage(testPacks, 100.0)
|
||||
rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
|
||||
for testPack := range testPacks {
|
||||
_, ok := selectedPacks[testPack]
|
||||
rtest.Assert(t, ok, "Expected input and output to be equal")
|
||||
}
|
||||
}
|
|
@ -238,12 +238,17 @@ integrity of the pack files in the repository, use the ``--read-data`` flag:
|
|||
repository, beware that it might incur higher bandwidth costs than usual
|
||||
and also that it takes more time than the default ``check``.
|
||||
|
||||
Alternatively, use the ``--read-data-subset=n/t`` parameter to check only a
|
||||
subset of the repository pack files at a time. The parameter takes two values,
|
||||
``n`` and ``t``. When the check command runs, all pack files in the repository
|
||||
are logically divided in ``t`` (roughly equal) groups, and only files that
|
||||
belong to group number ``n`` are checked. For example, the following commands
|
||||
check all repository pack files over 5 separate invocations:
|
||||
Alternatively, use the ``--read-data-subset`` parameter to check only a
|
||||
subset of the repository pack files at a time. It supports two ways to select a
|
||||
subset. One selects a specific range of pack files, the other selects a random
|
||||
percentage of pack files.
|
||||
|
||||
Use ``--read-data-subset=n/t`` to check only a subset of the repository pack
|
||||
files at a time. The parameter takes two values, ``n`` and ``t``. When the check
|
||||
command runs, all pack files in the repository are logically divided in ``t``
|
||||
(roughly equal) groups, and only files that belong to group number ``n`` are
|
||||
checked. For example, the following commands check all repository pack files
|
||||
over 5 separate invocations:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
|
@ -252,3 +257,21 @@ check all repository pack files over 5 separate invocations:
|
|||
$ restic -r /srv/restic-repo check --read-data-subset=3/5
|
||||
$ restic -r /srv/restic-repo check --read-data-subset=4/5
|
||||
$ restic -r /srv/restic-repo check --read-data-subset=5/5
|
||||
|
||||
Use ``--read-data-subset=n%`` to check a randomly chosen subset of the
|
||||
repository pack files. It takes one parameter, ``n``, the percentage of pack
|
||||
files to check as an integer or floating point number. This will not guarantee
|
||||
to cover all available pack files after sufficient runs, but it is easy to
|
||||
automate checking a small subset of data after each backup. For a floating point
|
||||
value the following command may be used:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ restic -r /srv/restic-repo check --read-data-subset=2.5%
|
||||
|
||||
When checking bigger subsets you most likely want to specify the percentage as an
|
||||
integer:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ restic -r /srv/restic-repo check --read-data-subset=10%
|
||||
|
|
Loading…
Reference in a new issue