Check data subset: check random percentage subset
This commit is contained in:
parent
3c0c0c132b
commit
8f9cea8cc0
4 changed files with 257 additions and 35 deletions
8
changelog/unreleased/pull-3038
Normal file
8
changelog/unreleased/pull-3038
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
Enhancement: Allow specifying percentage in check's --read-data-subset
|
||||||
|
|
||||||
|
We've enhanced the command-line option --read-data-subset to also accept a
|
||||||
|
percentage. This will check the given percentage of pack files which are
|
||||||
|
randomly selected on each run.
|
||||||
|
|
||||||
|
https://github.com/restic/restic/pull/3038
|
||||||
|
https://github.com/restic/restic/issues/2186
|
|
@ -1,8 +1,8 @@
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
|
"math/rand"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
@ -53,26 +53,39 @@ func init() {
|
||||||
|
|
||||||
f := cmdCheck.Flags()
|
f := cmdCheck.Flags()
|
||||||
f.BoolVar(&checkOptions.ReadData, "read-data", false, "read all data blobs")
|
f.BoolVar(&checkOptions.ReadData, "read-data", false, "read all data blobs")
|
||||||
f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read subset n of m data packs (format: `n/m`)")
|
f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read a `subset` of data packs, specified as 'n/t' for specific subset or either 'x%' or 'x.y%' for random subset")
|
||||||
f.BoolVar(&checkOptions.CheckUnused, "check-unused", false, "find unused blobs")
|
f.BoolVar(&checkOptions.CheckUnused, "check-unused", false, "find unused blobs")
|
||||||
f.BoolVar(&checkOptions.WithCache, "with-cache", false, "use the cache")
|
f.BoolVar(&checkOptions.WithCache, "with-cache", false, "use the cache")
|
||||||
}
|
}
|
||||||
|
|
||||||
func checkFlags(opts CheckOptions) error {
|
func checkFlags(opts CheckOptions) error {
|
||||||
if opts.ReadData && opts.ReadDataSubset != "" {
|
if opts.ReadData && opts.ReadDataSubset != "" {
|
||||||
return errors.Fatalf("check flags --read-data and --read-data-subset cannot be used together")
|
return errors.Fatal("check flags --read-data and --read-data-subset cannot be used together")
|
||||||
}
|
}
|
||||||
if opts.ReadDataSubset != "" {
|
if opts.ReadDataSubset != "" {
|
||||||
dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
|
dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
|
||||||
if err != nil || len(dataSubset) != 2 {
|
argumentError := errors.Fatal("check flag --read-data-subset must have two positive integer values or a percentage, e.g. --read-data-subset=1/2 or --read-data-subset=2.5%%")
|
||||||
return errors.Fatalf("check flag --read-data-subset must have two positive integer values, e.g. --read-data-subset=1/2")
|
if err == nil {
|
||||||
|
if len(dataSubset) != 2 {
|
||||||
|
return argumentError
|
||||||
}
|
}
|
||||||
if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] {
|
if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] {
|
||||||
return errors.Fatalf("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2")
|
return errors.Fatal("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2")
|
||||||
}
|
}
|
||||||
if dataSubset[1] > totalBucketsMax {
|
if dataSubset[1] > totalBucketsMax {
|
||||||
return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax)
|
return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax)
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
percentage, err := parsePercentage(opts.ReadDataSubset)
|
||||||
|
if err != nil {
|
||||||
|
return argumentError
|
||||||
|
}
|
||||||
|
|
||||||
|
if percentage <= 0.0 || percentage > 100.0 {
|
||||||
|
return errors.Fatal(
|
||||||
|
"check flag --read-data-subset=n% n must be above 0.0% and 100.0%")
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
@ -98,6 +111,21 @@ func stringToIntSlice(param string) (split []uint, err error) {
|
||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ParsePercentage parses a percentage string of the form "X%" where X is a float constant,
|
||||||
|
// and returns the value of that constant. It does not check the range of the value.
|
||||||
|
func parsePercentage(s string) (float64, error) {
|
||||||
|
if !strings.HasSuffix(s, "%") {
|
||||||
|
return 0, errors.Errorf(`parsePercentage: %q does not end in "%%"`, s)
|
||||||
|
}
|
||||||
|
s = s[:len(s)-1]
|
||||||
|
|
||||||
|
p, err := strconv.ParseFloat(s, 64)
|
||||||
|
if err != nil {
|
||||||
|
return 0, errors.Errorf("parsePercentage: %v", err)
|
||||||
|
}
|
||||||
|
return p, nil
|
||||||
|
}
|
||||||
|
|
||||||
// prepareCheckCache configures a special cache directory for check.
|
// prepareCheckCache configures a special cache directory for check.
|
||||||
//
|
//
|
||||||
// * if --with-cache is specified, the default cache is used
|
// * if --with-cache is specified, the default cache is used
|
||||||
|
@ -233,23 +261,9 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
doReadData := func(bucket, totalBuckets uint) {
|
doReadData := func(packs map[restic.ID]int64) {
|
||||||
packs := make(map[restic.ID]int64)
|
|
||||||
for pack, size := range chkr.GetPacks() {
|
|
||||||
// If we ever check more than the first byte
|
|
||||||
// of pack, update totalBucketsMax.
|
|
||||||
if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
|
|
||||||
packs[pack] = size
|
|
||||||
}
|
|
||||||
}
|
|
||||||
packCount := uint64(len(packs))
|
packCount := uint64(len(packs))
|
||||||
|
|
||||||
if packCount < chkr.CountPacks() {
|
|
||||||
Verbosef(fmt.Sprintf("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets))
|
|
||||||
} else {
|
|
||||||
Verbosef("read all data\n")
|
|
||||||
}
|
|
||||||
|
|
||||||
p := newProgressMax(!gopts.Quiet, packCount, "packs")
|
p := newProgressMax(!gopts.Quiet, packCount, "packs")
|
||||||
errChan := make(chan error)
|
errChan := make(chan error)
|
||||||
|
|
||||||
|
@ -264,10 +278,26 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case opts.ReadData:
|
case opts.ReadData:
|
||||||
doReadData(1, 1)
|
Verbosef("read all data\n")
|
||||||
|
doReadData(selectPacksByBucket(chkr.GetPacks(), 1, 1))
|
||||||
case opts.ReadDataSubset != "":
|
case opts.ReadDataSubset != "":
|
||||||
dataSubset, _ := stringToIntSlice(opts.ReadDataSubset)
|
var packs map[restic.ID]int64
|
||||||
doReadData(dataSubset[0], dataSubset[1])
|
dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
|
||||||
|
if err == nil {
|
||||||
|
bucket := dataSubset[0]
|
||||||
|
totalBuckets := dataSubset[1]
|
||||||
|
packs = selectPacksByBucket(chkr.GetPacks(), bucket, totalBuckets)
|
||||||
|
packCount := uint64(len(packs))
|
||||||
|
Verbosef("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets)
|
||||||
|
} else {
|
||||||
|
percentage, _ := parsePercentage(opts.ReadDataSubset)
|
||||||
|
packs = selectRandomPacksByPercentage(chkr.GetPacks(), percentage)
|
||||||
|
Verbosef("read %.1f%% of data packs\n", percentage)
|
||||||
|
}
|
||||||
|
if packs == nil {
|
||||||
|
return errors.Fatal("internal error: failed to select packs to check")
|
||||||
|
}
|
||||||
|
doReadData(packs)
|
||||||
}
|
}
|
||||||
|
|
||||||
if errorsFound {
|
if errorsFound {
|
||||||
|
@ -278,3 +308,40 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// selectPacksByBucket selects subsets of packs by ranges of buckets.
|
||||||
|
func selectPacksByBucket(allPacks map[restic.ID]int64, bucket, totalBuckets uint) map[restic.ID]int64 {
|
||||||
|
packs := make(map[restic.ID]int64)
|
||||||
|
for pack, size := range allPacks {
|
||||||
|
// If we ever check more than the first byte
|
||||||
|
// of pack, update totalBucketsMax.
|
||||||
|
if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
|
||||||
|
packs[pack] = size
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return packs
|
||||||
|
}
|
||||||
|
|
||||||
|
// selectRandomPacksByPercentage selects the given percentage of packs which are randomly choosen.
|
||||||
|
func selectRandomPacksByPercentage(allPacks map[restic.ID]int64, percentage float64) map[restic.ID]int64 {
|
||||||
|
packCount := len(allPacks)
|
||||||
|
packsToCheck := int(float64(packCount) * (percentage / 100.0))
|
||||||
|
if packsToCheck < 1 {
|
||||||
|
packsToCheck = 1
|
||||||
|
}
|
||||||
|
idx := rand.Perm(packCount)
|
||||||
|
|
||||||
|
var keys []restic.ID
|
||||||
|
for k := range allPacks {
|
||||||
|
keys = append(keys, k)
|
||||||
|
}
|
||||||
|
|
||||||
|
packs := make(map[restic.ID]int64)
|
||||||
|
|
||||||
|
for i := 0; i < packsToCheck; i++ {
|
||||||
|
id := keys[idx[i]]
|
||||||
|
packs[id] = allPacks[id]
|
||||||
|
}
|
||||||
|
|
||||||
|
return packs
|
||||||
|
}
|
||||||
|
|
124
cmd/restic/cmd_check_test.go
Normal file
124
cmd/restic/cmd_check_test.go
Normal file
|
@ -0,0 +1,124 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/restic/restic/internal/restic"
|
||||||
|
rtest "github.com/restic/restic/internal/test"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParsePercentage(t *testing.T) {
|
||||||
|
testCases := []struct {
|
||||||
|
input string
|
||||||
|
output float64
|
||||||
|
expectError bool
|
||||||
|
}{
|
||||||
|
{"0%", 0.0, false},
|
||||||
|
{"1%", 1.0, false},
|
||||||
|
{"100%", 100.0, false},
|
||||||
|
{"123%", 123.0, false},
|
||||||
|
{"123.456%", 123.456, false},
|
||||||
|
{"0.742%", 0.742, false},
|
||||||
|
{"-100%", -100.0, false},
|
||||||
|
{" 1%", 0.0, true},
|
||||||
|
{"1 %", 0.0, true},
|
||||||
|
{"1% ", 0.0, true},
|
||||||
|
}
|
||||||
|
for _, testCase := range testCases {
|
||||||
|
output, err := parsePercentage(testCase.input)
|
||||||
|
|
||||||
|
if testCase.expectError {
|
||||||
|
rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
|
||||||
|
rtest.Assert(t, output == 0.0, "Expected output to be 0.0, got %s", output)
|
||||||
|
} else {
|
||||||
|
rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
|
||||||
|
rtest.Assert(t, math.Abs(testCase.output-output) < 0.00001, "Expected %f, got %f",
|
||||||
|
testCase.output, output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestStringToIntSlice(t *testing.T) {
|
||||||
|
testCases := []struct {
|
||||||
|
input string
|
||||||
|
output []uint
|
||||||
|
expectError bool
|
||||||
|
}{
|
||||||
|
{"3/5", []uint{3, 5}, false},
|
||||||
|
{"1/100", []uint{1, 100}, false},
|
||||||
|
{"abc", nil, true},
|
||||||
|
{"1/a", nil, true},
|
||||||
|
{"/", nil, true},
|
||||||
|
}
|
||||||
|
for _, testCase := range testCases {
|
||||||
|
output, err := stringToIntSlice(testCase.input)
|
||||||
|
|
||||||
|
if testCase.expectError {
|
||||||
|
rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
|
||||||
|
rtest.Assert(t, output == nil, "Expected output to be nil, got %s", output)
|
||||||
|
} else {
|
||||||
|
rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
|
||||||
|
rtest.Assert(t, len(output) == 2, "Invalid output length for case %s", testCase.input)
|
||||||
|
rtest.Assert(t, reflect.DeepEqual(output, testCase.output), "Expected %f, got %f",
|
||||||
|
testCase.output, output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSelectPacksByBucket(t *testing.T) {
|
||||||
|
var testPacks = make(map[restic.ID]int64)
|
||||||
|
for i := 1; i <= 10; i++ {
|
||||||
|
id := restic.NewRandomID()
|
||||||
|
// ensure relevant part of generated id is reproducable
|
||||||
|
id[0] = byte(i)
|
||||||
|
testPacks[id] = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
selectedPacks := selectPacksByBucket(testPacks, 0, 10)
|
||||||
|
rtest.Assert(t, len(selectedPacks) == 0, "Expected 0 selected packs")
|
||||||
|
|
||||||
|
for i := uint(1); i <= 5; i++ {
|
||||||
|
selectedPacks = selectPacksByBucket(testPacks, i, 5)
|
||||||
|
rtest.Assert(t, len(selectedPacks) == 2, "Expected 2 selected packs")
|
||||||
|
}
|
||||||
|
|
||||||
|
selectedPacks = selectPacksByBucket(testPacks, 1, 1)
|
||||||
|
rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
|
||||||
|
for testPack := range testPacks {
|
||||||
|
_, ok := selectedPacks[testPack]
|
||||||
|
rtest.Assert(t, ok, "Expected input and output to be equal")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSelectRandomPacksByPercentage(t *testing.T) {
|
||||||
|
var testPacks = make(map[restic.ID]int64)
|
||||||
|
for i := 1; i <= 10; i++ {
|
||||||
|
testPacks[restic.NewRandomID()] = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
selectedPacks := selectRandomPacksByPercentage(testPacks, 0.0)
|
||||||
|
rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected packs")
|
||||||
|
|
||||||
|
selectedPacks = selectRandomPacksByPercentage(testPacks, 10.0)
|
||||||
|
rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected pack")
|
||||||
|
for pack := range selectedPacks {
|
||||||
|
_, ok := testPacks[pack]
|
||||||
|
rtest.Assert(t, ok, "Unexpected selection")
|
||||||
|
}
|
||||||
|
|
||||||
|
selectedPacks = selectRandomPacksByPercentage(testPacks, 50.0)
|
||||||
|
rtest.Assert(t, len(selectedPacks) == 5, "Expected 5 selected packs")
|
||||||
|
for pack := range selectedPacks {
|
||||||
|
_, ok := testPacks[pack]
|
||||||
|
rtest.Assert(t, ok, "Unexpected item in selection")
|
||||||
|
}
|
||||||
|
|
||||||
|
selectedPacks = selectRandomPacksByPercentage(testPacks, 100.0)
|
||||||
|
rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
|
||||||
|
for testPack := range testPacks {
|
||||||
|
_, ok := selectedPacks[testPack]
|
||||||
|
rtest.Assert(t, ok, "Expected input and output to be equal")
|
||||||
|
}
|
||||||
|
}
|
|
@ -238,12 +238,17 @@ integrity of the pack files in the repository, use the ``--read-data`` flag:
|
||||||
repository, beware that it might incur higher bandwidth costs than usual
|
repository, beware that it might incur higher bandwidth costs than usual
|
||||||
and also that it takes more time than the default ``check``.
|
and also that it takes more time than the default ``check``.
|
||||||
|
|
||||||
Alternatively, use the ``--read-data-subset=n/t`` parameter to check only a
|
Alternatively, use the ``--read-data-subset`` parameter to check only a
|
||||||
subset of the repository pack files at a time. The parameter takes two values,
|
subset of the repository pack files at a time. It supports two ways to select a
|
||||||
``n`` and ``t``. When the check command runs, all pack files in the repository
|
subset. One selects a specific range of pack files, the other selects a random
|
||||||
are logically divided in ``t`` (roughly equal) groups, and only files that
|
percentage of pack files.
|
||||||
belong to group number ``n`` are checked. For example, the following commands
|
|
||||||
check all repository pack files over 5 separate invocations:
|
Use ``--read-data-subset=n/t`` to check only a subset of the repository pack
|
||||||
|
files at a time. The parameter takes two values, ``n`` and ``t``. When the check
|
||||||
|
command runs, all pack files in the repository are logically divided in ``t``
|
||||||
|
(roughly equal) groups, and only files that belong to group number ``n`` are
|
||||||
|
checked. For example, the following commands check all repository pack files
|
||||||
|
over 5 separate invocations:
|
||||||
|
|
||||||
.. code-block:: console
|
.. code-block:: console
|
||||||
|
|
||||||
|
@ -252,3 +257,21 @@ check all repository pack files over 5 separate invocations:
|
||||||
$ restic -r /srv/restic-repo check --read-data-subset=3/5
|
$ restic -r /srv/restic-repo check --read-data-subset=3/5
|
||||||
$ restic -r /srv/restic-repo check --read-data-subset=4/5
|
$ restic -r /srv/restic-repo check --read-data-subset=4/5
|
||||||
$ restic -r /srv/restic-repo check --read-data-subset=5/5
|
$ restic -r /srv/restic-repo check --read-data-subset=5/5
|
||||||
|
|
||||||
|
Use ``--read-data-subset=n%`` to check a randomly chosen subset of the
|
||||||
|
repository pack files. It takes one parameter, ``n``, the percentage of pack
|
||||||
|
files to check as an integer or floating point number. Because the selection is random, there is no guarantee
|
||||||
|
that every pack file is eventually covered, even after many runs, but it makes it easy to
|
||||||
|
automate checking a small subset of data after each backup. For a floating point
|
||||||
|
value the following command may be used:
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
$ restic -r /srv/restic-repo check --read-data-subset=2.5%
|
||||||
|
|
||||||
|
When checking bigger subsets, you will most likely want to specify the percentage as an
|
||||||
|
integer:
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
$ restic -r /srv/restic-repo check --read-data-subset=10%
|
||||||
|
|
Loading…
Reference in a new issue