Merge pull request #3038 from fgma/check-data-subset-percentage

Check data subset: check random percentage subset
This commit is contained in:
MichaelEischer 2020-11-15 18:24:13 +01:00 committed by GitHub
commit 45ba456291
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 257 additions and 35 deletions

View file

@ -0,0 +1,8 @@
Enhancement: Allow specifying percentage in check's --read-data-subset
We've enhanced the command-line option --read-data-subset to also accept a
percentage. This will check the given percentage of pack files which are
randomly selected on each run.
https://github.com/restic/restic/issues/2186
https://github.com/restic/restic/pull/3038

View file

@ -1,8 +1,8 @@
package main package main
import ( import (
"fmt"
"io/ioutil" "io/ioutil"
"math/rand"
"strconv" "strconv"
"strings" "strings"
@ -53,25 +53,38 @@ func init() {
f := cmdCheck.Flags() f := cmdCheck.Flags()
f.BoolVar(&checkOptions.ReadData, "read-data", false, "read all data blobs") f.BoolVar(&checkOptions.ReadData, "read-data", false, "read all data blobs")
f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read subset n of m data packs (format: `n/m`)") f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read a `subset` of data packs, specified as 'n/t' for specific subset or either 'x%' or 'x.y%' for random subset")
f.BoolVar(&checkOptions.CheckUnused, "check-unused", false, "find unused blobs") f.BoolVar(&checkOptions.CheckUnused, "check-unused", false, "find unused blobs")
f.BoolVar(&checkOptions.WithCache, "with-cache", false, "use the cache") f.BoolVar(&checkOptions.WithCache, "with-cache", false, "use the cache")
} }
func checkFlags(opts CheckOptions) error { func checkFlags(opts CheckOptions) error {
if opts.ReadData && opts.ReadDataSubset != "" { if opts.ReadData && opts.ReadDataSubset != "" {
return errors.Fatalf("check flags --read-data and --read-data-subset cannot be used together") return errors.Fatal("check flags --read-data and --read-data-subset cannot be used together")
} }
if opts.ReadDataSubset != "" { if opts.ReadDataSubset != "" {
dataSubset, err := stringToIntSlice(opts.ReadDataSubset) dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
if err != nil || len(dataSubset) != 2 { argumentError := errors.Fatal("check flag --read-data-subset must have two positive integer values or a percentage, e.g. --read-data-subset=1/2 or --read-data-subset=2.5%%")
return errors.Fatalf("check flag --read-data-subset must have two positive integer values, e.g. --read-data-subset=1/2") if err == nil {
} if len(dataSubset) != 2 {
if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] { return argumentError
return errors.Fatalf("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2") }
} if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] {
if dataSubset[1] > totalBucketsMax { return errors.Fatal("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2")
return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax) }
if dataSubset[1] > totalBucketsMax {
return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax)
}
} else {
percentage, err := parsePercentage(opts.ReadDataSubset)
if err != nil {
return argumentError
}
if percentage <= 0.0 || percentage > 100.0 {
return errors.Fatal(
"check flag --read-data-subset=n% n must be above 0.0% and at most 100.0%")
}
} }
} }
@ -98,6 +111,21 @@ func stringToIntSlice(param string) (split []uint, err error) {
return result, nil return result, nil
} }
// ParsePercentage parses a percentage string of the form "X%" where X is a float constant,
// and returns the value of that constant. It does not check the range of the value.
func parsePercentage(s string) (float64, error) {
if !strings.HasSuffix(s, "%") {
return 0, errors.Errorf(`parsePercentage: %q does not end in "%%"`, s)
}
s = s[:len(s)-1]
p, err := strconv.ParseFloat(s, 64)
if err != nil {
return 0, errors.Errorf("parsePercentage: %v", err)
}
return p, nil
}
// prepareCheckCache configures a special cache directory for check. // prepareCheckCache configures a special cache directory for check.
// //
// * if --with-cache is specified, the default cache is used // * if --with-cache is specified, the default cache is used
@ -233,23 +261,9 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
} }
} }
doReadData := func(bucket, totalBuckets uint) { doReadData := func(packs map[restic.ID]int64) {
packs := make(map[restic.ID]int64)
for pack, size := range chkr.GetPacks() {
// If we ever check more than the first byte
// of pack, update totalBucketsMax.
if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
packs[pack] = size
}
}
packCount := uint64(len(packs)) packCount := uint64(len(packs))
if packCount < chkr.CountPacks() {
Verbosef(fmt.Sprintf("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets))
} else {
Verbosef("read all data\n")
}
p := newProgressMax(!gopts.Quiet, packCount, "packs") p := newProgressMax(!gopts.Quiet, packCount, "packs")
errChan := make(chan error) errChan := make(chan error)
@ -264,10 +278,26 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
switch { switch {
case opts.ReadData: case opts.ReadData:
doReadData(1, 1) Verbosef("read all data\n")
doReadData(selectPacksByBucket(chkr.GetPacks(), 1, 1))
case opts.ReadDataSubset != "": case opts.ReadDataSubset != "":
dataSubset, _ := stringToIntSlice(opts.ReadDataSubset) var packs map[restic.ID]int64
doReadData(dataSubset[0], dataSubset[1]) dataSubset, err := stringToIntSlice(opts.ReadDataSubset)
if err == nil {
bucket := dataSubset[0]
totalBuckets := dataSubset[1]
packs = selectPacksByBucket(chkr.GetPacks(), bucket, totalBuckets)
packCount := uint64(len(packs))
Verbosef("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets)
} else {
percentage, _ := parsePercentage(opts.ReadDataSubset)
packs = selectRandomPacksByPercentage(chkr.GetPacks(), percentage)
Verbosef("read %.1f%% of data packs\n", percentage)
}
if packs == nil {
return errors.Fatal("internal error: failed to select packs to check")
}
doReadData(packs)
} }
if errorsFound { if errorsFound {
@ -278,3 +308,40 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error {
return nil return nil
} }
// selectPacksByBucket selects subsets of packs by ranges of buckets.
func selectPacksByBucket(allPacks map[restic.ID]int64, bucket, totalBuckets uint) map[restic.ID]int64 {
packs := make(map[restic.ID]int64)
for pack, size := range allPacks {
// If we ever check more than the first byte
// of pack, update totalBucketsMax.
if (uint(pack[0]) % totalBuckets) == (bucket - 1) {
packs[pack] = size
}
}
return packs
}
// selectRandomPacksByPercentage selects the given percentage of packs which are randomly choosen.
func selectRandomPacksByPercentage(allPacks map[restic.ID]int64, percentage float64) map[restic.ID]int64 {
packCount := len(allPacks)
packsToCheck := int(float64(packCount) * (percentage / 100.0))
if packsToCheck < 1 {
packsToCheck = 1
}
idx := rand.Perm(packCount)
var keys []restic.ID
for k := range allPacks {
keys = append(keys, k)
}
packs := make(map[restic.ID]int64)
for i := 0; i < packsToCheck; i++ {
id := keys[idx[i]]
packs[id] = allPacks[id]
}
return packs
}

View file

@ -0,0 +1,124 @@
package main
import (
"math"
"reflect"
"testing"
"github.com/restic/restic/internal/restic"
rtest "github.com/restic/restic/internal/test"
)
func TestParsePercentage(t *testing.T) {
testCases := []struct {
input string
output float64
expectError bool
}{
{"0%", 0.0, false},
{"1%", 1.0, false},
{"100%", 100.0, false},
{"123%", 123.0, false},
{"123.456%", 123.456, false},
{"0.742%", 0.742, false},
{"-100%", -100.0, false},
{" 1%", 0.0, true},
{"1 %", 0.0, true},
{"1% ", 0.0, true},
}
for _, testCase := range testCases {
output, err := parsePercentage(testCase.input)
if testCase.expectError {
rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
rtest.Assert(t, output == 0.0, "Expected output to be 0.0, got %s", output)
} else {
rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
rtest.Assert(t, math.Abs(testCase.output-output) < 0.00001, "Expected %f, got %f",
testCase.output, output)
}
}
}
func TestStringToIntSlice(t *testing.T) {
testCases := []struct {
input string
output []uint
expectError bool
}{
{"3/5", []uint{3, 5}, false},
{"1/100", []uint{1, 100}, false},
{"abc", nil, true},
{"1/a", nil, true},
{"/", nil, true},
}
for _, testCase := range testCases {
output, err := stringToIntSlice(testCase.input)
if testCase.expectError {
rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input)
rtest.Assert(t, output == nil, "Expected output to be nil, got %s", output)
} else {
rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input)
rtest.Assert(t, len(output) == 2, "Invalid output length for case %s", testCase.input)
rtest.Assert(t, reflect.DeepEqual(output, testCase.output), "Expected %f, got %f",
testCase.output, output)
}
}
}
func TestSelectPacksByBucket(t *testing.T) {
var testPacks = make(map[restic.ID]int64)
for i := 1; i <= 10; i++ {
id := restic.NewRandomID()
// ensure relevant part of generated id is reproducable
id[0] = byte(i)
testPacks[id] = 0
}
selectedPacks := selectPacksByBucket(testPacks, 0, 10)
rtest.Assert(t, len(selectedPacks) == 0, "Expected 0 selected packs")
for i := uint(1); i <= 5; i++ {
selectedPacks = selectPacksByBucket(testPacks, i, 5)
rtest.Assert(t, len(selectedPacks) == 2, "Expected 2 selected packs")
}
selectedPacks = selectPacksByBucket(testPacks, 1, 1)
rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
for testPack := range testPacks {
_, ok := selectedPacks[testPack]
rtest.Assert(t, ok, "Expected input and output to be equal")
}
}
func TestSelectRandomPacksByPercentage(t *testing.T) {
var testPacks = make(map[restic.ID]int64)
for i := 1; i <= 10; i++ {
testPacks[restic.NewRandomID()] = 0
}
selectedPacks := selectRandomPacksByPercentage(testPacks, 0.0)
rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected packs")
selectedPacks = selectRandomPacksByPercentage(testPacks, 10.0)
rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected pack")
for pack := range selectedPacks {
_, ok := testPacks[pack]
rtest.Assert(t, ok, "Unexpected selection")
}
selectedPacks = selectRandomPacksByPercentage(testPacks, 50.0)
rtest.Assert(t, len(selectedPacks) == 5, "Expected 5 selected packs")
for pack := range selectedPacks {
_, ok := testPacks[pack]
rtest.Assert(t, ok, "Unexpected item in selection")
}
selectedPacks = selectRandomPacksByPercentage(testPacks, 100.0)
rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs")
for testPack := range testPacks {
_, ok := selectedPacks[testPack]
rtest.Assert(t, ok, "Expected input and output to be equal")
}
}

View file

@ -238,12 +238,17 @@ integrity of the pack files in the repository, use the ``--read-data`` flag:
repository, beware that it might incur higher bandwidth costs than usual repository, beware that it might incur higher bandwidth costs than usual
and also that it takes more time than the default ``check``. and also that it takes more time than the default ``check``.
Alternatively, use the ``--read-data-subset=n/t`` parameter to check only a Alternatively, use the ``--read-data-subset`` parameter to check only a
subset of the repository pack files at a time. The parameter takes two values, subset of the repository pack files at a time. It supports two ways to select a
``n`` and ``t``. When the check command runs, all pack files in the repository subset. One selects a specific range of pack files, the other selects a random
are logically divided in ``t`` (roughly equal) groups, and only files that percentage of pack files.
belong to group number ``n`` are checked. For example, the following commands
check all repository pack files over 5 separate invocations: Use ``--read-data-subset=n/t`` to check only a subset of the repository pack
files at a time. The parameter takes two values, ``n`` and ``t``. When the check
command runs, all pack files in the repository are logically divided in ``t``
(roughly equal) groups, and only files that belong to group number ``n`` are
checked. For example, the following commands check all repository pack files
over 5 separate invocations:
.. code-block:: console .. code-block:: console
@ -252,3 +257,21 @@ check all repository pack files over 5 separate invocations:
$ restic -r /srv/restic-repo check --read-data-subset=3/5 $ restic -r /srv/restic-repo check --read-data-subset=3/5
$ restic -r /srv/restic-repo check --read-data-subset=4/5 $ restic -r /srv/restic-repo check --read-data-subset=4/5
$ restic -r /srv/restic-repo check --read-data-subset=5/5 $ restic -r /srv/restic-repo check --read-data-subset=5/5
Use ``--read-data-subset=n%`` to check a randomly choosen subset of the
repository pack files. It takes one parameter, ``n``, the percentage of pack
files to check as an integer or floating point number. This will not guarantee
to cover all available pack files after sufficient runs, but it is easy to
automate checking a small subset of data after each backup. For a floating point
value the following command may be used:
.. code-block:: console
$ restic -r /srv/restic-repo check --read-data-subset=2.5%
When checking bigger subsets you most likely specify the percentage as an
integer:
.. code-block:: console
$ restic -r /srv/restic-repo check --read-data-subset=10%