Check data subset: check random percentage subset

This commit is contained in:
fgma 2020-10-24 17:30:42 +02:00 committed by Michael Eischer
parent 3c0c0c132b
commit 8f9cea8cc0
4 changed files with 257 additions and 35 deletions

View file

@ -0,0 +1,8 @@
Enhancement: Allow specifying percentage in check's --read-data-subset
We've enhanced the command-line option --read-data-subset to also accept a
percentage. This will check the given percentage of pack files which are
randomly selected on each run.
https://github.com/restic/restic/pull/3038
https://github.com/restic/restic/issues/2186

View file

@ -1,8 +1,8 @@
package main
import (
"fmt"
"io/ioutil"
"math/rand"
"strconv"
"strings"
@ -53,25 +53,38 @@ func init() {
f := cmdCheck.Flags()
f.BoolVar(&checkOptions.ReadData, "read-data", false, "read all data blobs")
f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read subset n of m data packs (format: `n/m`)")
f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read a `subset` of data packs, specified as 'n/t' for specific subset or either 'x%' or 'x.y%' for random subset")
f.BoolVar(&checkOptions.CheckUnused, "check-unused", false, "find unused blobs")
f.BoolVar(&checkOptions.WithCache, "with-cache", false, "use the cache")
}
func checkFlags(opts CheckOptions) error {
if opts.ReadData && opts.ReadDataSubset != "" {
return errors.Fatalf("check flags --read-data and --read-data-subset cannot be used together")
return errors.Fatal("check flags --read-data and --read-data-subset cannot be used together")
}
if opts.ReadDataSubset != "" {
dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
if err != nil || len(dataSubset) != 2 {
return errors.Fatalf("check flag --read-data-subset must have two positive integer values, e.g. --read-data-subset=1/2")
}
if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] {
return errors.Fatalf("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2")
}
if dataSubset[1] > totalBucketsMax {
return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax)
argumentError := errors.Fatal("check flag --read-data-subset must have two positive integer values or a percentage, e.g. --read-data-subset=1/2 or --read-data-subset=2.5%%")
if err == nil {
if len(dataSubset) != 2 {
return argumentError
}
if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] {
return errors.Fatal("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2")
}
if dataSubset[1] > totalBucketsMax {
return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax)
}
} else {
percentage, err := parsePercentage(opts.ReadDataSubset)
if err != nil {
return argumentError
}
if percentage <= 0.0 || percentage > 100.0 {
return errors.Fatal(
"check flag --read-data-subset=n% n must be above 0.0% and 100.0%")
}
}
}
@ -98,6 +111,21 @@ func stringToIntSlice(param string) (split []uint, err error) {
return result, nil
}
// ParsePercentage parses a percentage string of the form "X%" where X is a float constant,
// and returns the value of that constant. It does not check the range of the value.
func parsePercentage(s string) (float64, error) {
if !strings.HasSuffix(s, "%") {
return 0, errors.Errorf(`parsePercentage: %q does not end in "%%"`, s)
}
s = s[:len(s)-1]
p, err := strconv.ParseFloat(s, 64)
if err != nil {
return 0, errors.Errorf("parsePercentage: %v", err)
}
return p, nil
}
// prepareCheckCache configures a special cache directory for check.
//
// * if --with-cache is specified, the default cache is used
@ -233,23 +261,9 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
}
}
doReadData := func(bucket, totalBuckets uint) {
packs := make(map[restic.ID]int64)
for pack, size := range chkr.GetPacks() {
// If we ever check more than the first byte
// of pack, update totalBucketsMax.
if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
packs[pack] = size
}
}
doReadData := func(packs map[restic.ID]int64) {
packCount := uint64(len(packs))
if packCount < chkr.CountPacks() {
Verbosef(fmt.Sprintf("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets))
} else {
Verbosef("read all data\n")
}
p := newProgressMax(!gopts.Quiet, packCount, "packs")
errChan := make(chan error)
@ -264,10 +278,26 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
switch {
case opts.ReadData:
doReadData(1, 1)
Verbosef("read all data\n")
doReadData(selectPacksByBucket(chkr.GetPacks(), 1, 1))
case opts.ReadDataSubset != "":
dataSubset, _ := stringToIntSlice(opts.ReadDataSubset)
doReadData(dataSubset[0], dataSubset[1])
var packs map[restic.ID]int64
dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
if err == nil {
bucket := dataSubset[0]
totalBuckets := dataSubset[1]
packs = selectPacksByBucket(chkr.GetPacks(), bucket, totalBuckets)
packCount := uint64(len(packs))
Verbosef("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets)
} else {
percentage, _ := parsePercentage(opts.ReadDataSubset)
packs = selectRandomPacksByPercentage(chkr.GetPacks(), percentage)
Verbosef("read %.1f%% of data packs\n", percentage)
}
if packs == nil {
return errors.Fatal("internal error: failed to select packs to check")
}
doReadData(packs)
}
if errorsFound {
@ -278,3 +308,40 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
return nil
}
// selectPacksByBucket selects subsets of packs by ranges of buckets.
func selectPacksByBucket(allPacks map[restic.ID]int64, bucket, totalBuckets uint) map[restic.ID]int64 {
packs := make(map[restic.ID]int64)
for pack, size := range allPacks {
// If we ever check more than the first byte
// of pack, update totalBucketsMax.
if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
packs[pack] = size
}
}
return packs
}
// selectRandomPacksByPercentage selects the given percentage of packs which are randomly choosen.
func selectRandomPacksByPercentage(allPacks map[restic.ID]int64, percentage float64) map[restic.ID]int64 {
packCount := len(allPacks)
packsToCheck := int(float64(packCount) * (percentage / 100.0))
if packsToCheck < 1 {
packsToCheck = 1
}
idx := rand.Perm(packCount)
var keys []restic.ID
for k := range allPacks {
keys = append(keys, k)
}
packs := make(map[restic.ID]int64)
for i := 0; i < packsToCheck; i++ {
id := keys[idx[i]]
packs[id] = allPacks[id]
}
return packs
}

View file

@ -0,0 +1,124 @@
package main
import (
"math"
"reflect"
"testing"
"github.com/restic/restic/internal/restic"
rtest "github.com/restic/restic/internal/test"
)
func TestParsePercentage(t *testing.T) {
testCases := []struct {
input string
output float64
expectError bool
}{
{"0%", 0.0, false},
{"1%", 1.0, false},
{"100%", 100.0, false},
{"123%", 123.0, false},
{"123.456%", 123.456, false},
{"0.742%", 0.742, false},
{"-100%", -100.0, false},
{" 1%", 0.0, true},
{"1 %", 0.0, true},
{"1% ", 0.0, true},
}
for _, testCase := range testCases {
output, err := parsePercentage(testCase.input)
if testCase.expectError {
rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
rtest.Assert(t, output == 0.0, "Expected output to be 0.0, got %s", output)
} else {
rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
rtest.Assert(t, math.Abs(testCase.output-output) < 0.00001, "Expected %f, got %f",
testCase.output, output)
}
}
}
func TestStringToIntSlice(t *testing.T) {
testCases := []struct {
input string
output []uint
expectError bool
}{
{"3/5", []uint{3, 5}, false},
{"1/100", []uint{1, 100}, false},
{"abc", nil, true},
{"1/a", nil, true},
{"/", nil, true},
}
for _, testCase := range testCases {
output, err := stringToIntSlice(testCase.input)
if testCase.expectError {
rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
rtest.Assert(t, output == nil, "Expected output to be nil, got %s", output)
} else {
rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
rtest.Assert(t, len(output) == 2, "Invalid output length for case %s", testCase.input)
rtest.Assert(t, reflect.DeepEqual(output, testCase.output), "Expected %f, got %f",
testCase.output, output)
}
}
}
func TestSelectPacksByBucket(t *testing.T) {
var testPacks = make(map[restic.ID]int64)
for i := 1; i <= 10; i++ {
id := restic.NewRandomID()
// ensure relevant part of generated id is reproducable
id[0] = byte(i)
testPacks[id] = 0
}
selectedPacks := selectPacksByBucket(testPacks, 0, 10)
rtest.Assert(t, len(selectedPacks) == 0, "Expected 0 selected packs")
for i := uint(1); i <= 5; i++ {
selectedPacks = selectPacksByBucket(testPacks, i, 5)
rtest.Assert(t, len(selectedPacks) == 2, "Expected 2 selected packs")
}
selectedPacks = selectPacksByBucket(testPacks, 1, 1)
rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
for testPack := range testPacks {
_, ok := selectedPacks[testPack]
rtest.Assert(t, ok, "Expected input and output to be equal")
}
}
func TestSelectRandomPacksByPercentage(t *testing.T) {
var testPacks = make(map[restic.ID]int64)
for i := 1; i <= 10; i++ {
testPacks[restic.NewRandomID()] = 0
}
selectedPacks := selectRandomPacksByPercentage(testPacks, 0.0)
rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected packs")
selectedPacks = selectRandomPacksByPercentage(testPacks, 10.0)
rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected pack")
for pack := range selectedPacks {
_, ok := testPacks[pack]
rtest.Assert(t, ok, "Unexpected selection")
}
selectedPacks = selectRandomPacksByPercentage(testPacks, 50.0)
rtest.Assert(t, len(selectedPacks) == 5, "Expected 5 selected packs")
for pack := range selectedPacks {
_, ok := testPacks[pack]
rtest.Assert(t, ok, "Unexpected item in selection")
}
selectedPacks = selectRandomPacksByPercentage(testPacks, 100.0)
rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
for testPack := range testPacks {
_, ok := selectedPacks[testPack]
rtest.Assert(t, ok, "Expected input and output to be equal")
}
}

View file

@ -238,12 +238,17 @@ integrity of the pack files in the repository, use the ``--read-data`` flag:
repository, beware that it might incur higher bandwidth costs than usual
and also that it takes more time than the default ``check``.
Alternatively, use the ``--read-data-subset=n/t`` parameter to check only a
subset of the repository pack files at a time. The parameter takes two values,
``n`` and ``t``. When the check command runs, all pack files in the repository
are logically divided in ``t`` (roughly equal) groups, and only files that
belong to group number ``n`` are checked. For example, the following commands
check all repository pack files over 5 separate invocations:
Alternatively, use the ``--read-data-subset`` parameter to check only a
subset of the repository pack files at a time. It supports two ways to select a
subset. One selects a specific range of pack files, the other selects a random
percentage of pack files.
Use ``--read-data-subset=n/t`` to check only a subset of the repository pack
files at a time. The parameter takes two values, ``n`` and ``t``. When the check
command runs, all pack files in the repository are logically divided in ``t``
(roughly equal) groups, and only files that belong to group number ``n`` are
checked. For example, the following commands check all repository pack files
over 5 separate invocations:
.. code-block:: console
@ -252,3 +257,21 @@ check all repository pack files over 5 separate invocations:
$ restic -r /srv/restic-repo check --read-data-subset=3/5
$ restic -r /srv/restic-repo check --read-data-subset=4/5
$ restic -r /srv/restic-repo check --read-data-subset=5/5
Use ``--read-data-subset=n%`` to check a randomly choosen subset of the
repository pack files. It takes one parameter, ``n``, the percentage of pack
files to check as an integer or floating point number. This will not guarantee
to cover all available pack files after sufficient runs, but it is easy to
automate checking a small subset of data after each backup. For a floating point
value the following command may be used:
.. code-block:: console
$ restic -r /srv/restic-repo check --read-data-subset=2.5%
When checking bigger subsets you most likely specify the percentage as an
integer:
.. code-block:: console
$ restic -r /srv/restic-repo check --read-data-subset=10%