From 8f9cea8cc0e4523e4b74dd4988c4081f908f5cad Mon Sep 17 00:00:00 2001 From: fgma Date: Sat, 24 Oct 2020 17:30:42 +0200 Subject: [PATCH 1/3] Check data subset: check random percentage subset --- changelog/unreleased/pull-3038 | 8 +++ cmd/restic/cmd_check.go | 125 +++++++++++++++++++++++++-------- cmd/restic/cmd_check_test.go | 124 ++++++++++++++++++++++++++++++++ doc/045_working_with_repos.rst | 35 +++++++-- 4 files changed, 257 insertions(+), 35 deletions(-) create mode 100644 changelog/unreleased/pull-3038 create mode 100644 cmd/restic/cmd_check_test.go diff --git a/changelog/unreleased/pull-3038 b/changelog/unreleased/pull-3038 new file mode 100644 index 000000000..496df276b --- /dev/null +++ b/changelog/unreleased/pull-3038 @@ -0,0 +1,8 @@ +Enhancement: Allow specifying percentage in check's --read-data-subset + +We've enhanced the command-line option --read-data-subset to also accept a +percentage. This will check the given percentage of pack files which are +randomly selected on each run. + +https://github.com/restic/restic/pull/3038 +https://github.com/restic/restic/issues/2186 diff --git a/cmd/restic/cmd_check.go b/cmd/restic/cmd_check.go index 618b98fe2..f261a45ca 100644 --- a/cmd/restic/cmd_check.go +++ b/cmd/restic/cmd_check.go @@ -1,8 +1,8 @@ package main import ( - "fmt" "io/ioutil" + "math/rand" "strconv" "strings" @@ -53,25 +53,38 @@ func init() { f := cmdCheck.Flags() f.BoolVar(&checkOptions.ReadData, "read-data", false, "read all data blobs") - f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read subset n of m data packs (format: `n/m`)") + f.StringVar(&checkOptions.ReadDataSubset, "read-data-subset", "", "read a `subset` of data packs, specified as 'n/t' for specific subset or either 'x%' or 'x.y%' for random subset") f.BoolVar(&checkOptions.CheckUnused, "check-unused", false, "find unused blobs") f.BoolVar(&checkOptions.WithCache, "with-cache", false, "use the cache") } func checkFlags(opts CheckOptions) error { if opts.ReadData && opts.ReadDataSubset != "" { - return errors.Fatalf("check flags --read-data and --read-data-subset cannot be used together") + return errors.Fatal("check flags --read-data and --read-data-subset cannot be used together") } if opts.ReadDataSubset != "" { dataSubset, err := stringToIntSlice(opts.ReadDataSubset) - if err != nil || len(dataSubset) != 2 { - return errors.Fatalf("check flag --read-data-subset must have two positive integer values, e.g. --read-data-subset=1/2") - } - if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] { - return errors.Fatalf("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2") - } - if dataSubset[1] > totalBucketsMax { - return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax) + argumentError := errors.Fatal("check flag --read-data-subset must have two positive integer values or a percentage, e.g. --read-data-subset=1/2 or --read-data-subset=2.5%%") + if err == nil { + if len(dataSubset) != 2 { + return argumentError + } + if dataSubset[0] == 0 || dataSubset[1] == 0 || dataSubset[0] > dataSubset[1] { + return errors.Fatal("check flag --read-data-subset=n/t values must be positive integers, and n <= t, e.g. --read-data-subset=1/2") + } + if dataSubset[1] > totalBucketsMax { + return errors.Fatalf("check flag --read-data-subset=n/t t must be at most %d", totalBucketsMax) + } + } else { + percentage, err := parsePercentage(opts.ReadDataSubset) + if err != nil { + return argumentError + } + + if percentage <= 0.0 || percentage > 100.0 { + return errors.Fatal( + "check flag --read-data-subset=n% n must be above 0.0% and 100.0%") + } } } @@ -98,6 +111,21 @@ func stringToIntSlice(param string) (split []uint, err error) { return result, nil } +// ParsePercentage parses a percentage string of the form "X%" where X is a float constant, +// and returns the value of that constant. It does not check the range of the value. +func parsePercentage(s string) (float64, error) { + if !strings.HasSuffix(s, "%") { + return 0, errors.Errorf(`parsePercentage: %q does not end in "%%"`, s) + } + s = s[:len(s)-1] + + p, err := strconv.ParseFloat(s, 64) + if err != nil { + return 0, errors.Errorf("parsePercentage: %v", err) + } + return p, nil +} + // prepareCheckCache configures a special cache directory for check. // // * if --with-cache is specified, the default cache is used @@ -233,23 +261,9 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error { } } - doReadData := func(bucket, totalBuckets uint) { - packs := make(map[restic.ID]int64) - for pack, size := range chkr.GetPacks() { - // If we ever check more than the first byte - // of pack, update totalBucketsMax. - if (uint(pack[0]) % totalBuckets) == (bucket - 1) { - packs[pack] = size - } - } + doReadData := func(packs map[restic.ID]int64) { packCount := uint64(len(packs)) - if packCount < chkr.CountPacks() { - Verbosef(fmt.Sprintf("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets)) - } else { - Verbosef("read all data\n") - } - p := newProgressMax(!gopts.Quiet, packCount, "packs") errChan := make(chan error) @@ -264,10 +278,26 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error { switch { case opts.ReadData: - doReadData(1, 1) + Verbosef("read all data\n") + doReadData(selectPacksByBucket(chkr.GetPacks(), 1, 1)) case opts.ReadDataSubset != "": - dataSubset, _ := stringToIntSlice(opts.ReadDataSubset) - doReadData(dataSubset[0], dataSubset[1]) + var packs map[restic.ID]int64 + dataSubset, err := stringToIntSlice(opts.ReadDataSubset) + if err == nil { + bucket := dataSubset[0] + totalBuckets := dataSubset[1] + packs = selectPacksByBucket(chkr.GetPacks(), bucket, totalBuckets) + packCount := uint64(len(packs)) + Verbosef("read group #%d of %d data packs (out of total %d packs in %d groups)\n", bucket, packCount, chkr.CountPacks(), totalBuckets) + } else { + percentage, _ := parsePercentage(opts.ReadDataSubset) + packs = selectRandomPacksByPercentage(chkr.GetPacks(), percentage) + Verbosef("read %.1f%% of data packs\n", percentage) + } + if packs == nil { + return errors.Fatal("internal error: failed to select packs to check") + } + doReadData(packs) } if errorsFound { @@ -278,3 +308,40 @@ func runCheck(opts CheckOptions, gopts GlobalOptions, args []string) error { return nil } + +// selectPacksByBucket selects subsets of packs by ranges of buckets. +func selectPacksByBucket(allPacks map[restic.ID]int64, bucket, totalBuckets uint) map[restic.ID]int64 { + packs := make(map[restic.ID]int64) + for pack, size := range allPacks { + // If we ever check more than the first byte + // of pack, update totalBucketsMax. + if (uint(pack[0]) % totalBuckets) == (bucket - 1) { + packs[pack] = size + } + } + return packs +} + +// selectRandomPacksByPercentage selects the given percentage of packs which are randomly choosen. +func selectRandomPacksByPercentage(allPacks map[restic.ID]int64, percentage float64) map[restic.ID]int64 { + packCount := len(allPacks) + packsToCheck := int(float64(packCount) * (percentage / 100.0)) + if packsToCheck < 1 { + packsToCheck = 1 + } + idx := rand.Perm(packCount) + + var keys []restic.ID + for k := range allPacks { + keys = append(keys, k) + } + + packs := make(map[restic.ID]int64) + + for i := 0; i < packsToCheck; i++ { + id := keys[idx[i]] + packs[id] = allPacks[id] + } + + return packs +} diff --git a/cmd/restic/cmd_check_test.go b/cmd/restic/cmd_check_test.go new file mode 100644 index 000000000..929771151 --- /dev/null +++ b/cmd/restic/cmd_check_test.go @@ -0,0 +1,124 @@ +package main + +import ( + "math" + "reflect" + "testing" + + "github.com/restic/restic/internal/restic" + rtest "github.com/restic/restic/internal/test" +) + +func TestParsePercentage(t *testing.T) { + testCases := []struct { + input string + output float64 + expectError bool + }{ + {"0%", 0.0, false}, + {"1%", 1.0, false}, + {"100%", 100.0, false}, + {"123%", 123.0, false}, + {"123.456%", 123.456, false}, + {"0.742%", 0.742, false}, + {"-100%", -100.0, false}, + {" 1%", 0.0, true}, + {"1 %", 0.0, true}, + {"1% ", 0.0, true}, + } + for _, testCase := range testCases { + output, err := parsePercentage(testCase.input) + + if testCase.expectError { + rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input) + rtest.Assert(t, output == 0.0, "Expected output to be 0.0, got %s", output) + } else { + rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input) + rtest.Assert(t, math.Abs(testCase.output-output) < 0.00001, "Expected %f, got %f", + testCase.output, output) + } + } +} + +func TestStringToIntSlice(t *testing.T) { + testCases := []struct { + input string + output []uint + expectError bool + }{ + {"3/5", []uint{3, 5}, false}, + {"1/100", []uint{1, 100}, false}, + {"abc", nil, true}, + {"1/a", nil, true}, + {"/", nil, true}, + } + for _, testCase := range testCases { + output, err := stringToIntSlice(testCase.input) + + if testCase.expectError { + rtest.Assert(t, err != nil, "Expected error for case %s", testCase.input) + rtest.Assert(t, output == nil, "Expected output to be nil, got %s", output) + } else { + rtest.Assert(t, err == nil, "Expected no error for case %s", testCase.input) + rtest.Assert(t, len(output) == 2, "Invalid output length for case %s", testCase.input) + rtest.Assert(t, reflect.DeepEqual(output, testCase.output), "Expected %f, got %f", + testCase.output, output) + } + } +} + +func TestSelectPacksByBucket(t *testing.T) { + var testPacks = make(map[restic.ID]int64) + for i := 1; i <= 10; i++ { + id := restic.NewRandomID() + // ensure relevant part of generated id is reproducable + id[0] = byte(i) + testPacks[id] = 0 + } + + selectedPacks := selectPacksByBucket(testPacks, 0, 10) + rtest.Assert(t, len(selectedPacks) == 0, "Expected 0 selected packs") + + for i := uint(1); i <= 5; i++ { + selectedPacks = selectPacksByBucket(testPacks, i, 5) + rtest.Assert(t, len(selectedPacks) == 2, "Expected 2 selected packs") + } + + selectedPacks = selectPacksByBucket(testPacks, 1, 1) + rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs") + for testPack := range testPacks { + _, ok := selectedPacks[testPack] + rtest.Assert(t, ok, "Expected input and output to be equal") + } +} + +func TestSelectRandomPacksByPercentage(t *testing.T) { + var testPacks = make(map[restic.ID]int64) + for i := 1; i <= 10; i++ { + testPacks[restic.NewRandomID()] = 0 + } + + selectedPacks := selectRandomPacksByPercentage(testPacks, 0.0) + rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected packs") + + selectedPacks = selectRandomPacksByPercentage(testPacks, 10.0) + rtest.Assert(t, len(selectedPacks) == 1, "Expected 1 selected pack") + for pack := range selectedPacks { + _, ok := testPacks[pack] + rtest.Assert(t, ok, "Unexpected selection") + } + + selectedPacks = selectRandomPacksByPercentage(testPacks, 50.0) + rtest.Assert(t, len(selectedPacks) == 5, "Expected 5 selected packs") + for pack := range selectedPacks { + _, ok := testPacks[pack] + rtest.Assert(t, ok, "Unexpected item in selection") + } + + selectedPacks = selectRandomPacksByPercentage(testPacks, 100.0) + rtest.Assert(t, len(selectedPacks) == 10, "Expected 10 selected packs") + for testPack := range testPacks { + _, ok := selectedPacks[testPack] + rtest.Assert(t, ok, "Expected input and output to be equal") + } +} diff --git a/doc/045_working_with_repos.rst b/doc/045_working_with_repos.rst index 8ce9f8815..c40260acc 100644 --- a/doc/045_working_with_repos.rst +++ b/doc/045_working_with_repos.rst @@ -238,12 +238,17 @@ integrity of the pack files in the repository, use the ``--read-data`` flag: repository, beware that it might incur higher bandwidth costs than usual and also that it takes more time than the default ``check``. -Alternatively, use the ``--read-data-subset=n/t`` parameter to check only a -subset of the repository pack files at a time. The parameter takes two values, -``n`` and ``t``. When the check command runs, all pack files in the repository -are logically divided in ``t`` (roughly equal) groups, and only files that -belong to group number ``n`` are checked. For example, the following commands -check all repository pack files over 5 separate invocations: +Alternatively, use the ``--read-data-subset`` parameter to check only a +subset of the repository pack files at a time. It supports two ways to select a +subset. One selects a specific range of pack files, the other selects a random +percentage of pack files. + +Use ``--read-data-subset=n/t`` to check only a subset of the repository pack +files at a time. The parameter takes two values, ``n`` and ``t``. When the check +command runs, all pack files in the repository are logically divided in ``t`` +(roughly equal) groups, and only files that belong to group number ``n`` are +checked. For example, the following commands check all repository pack files +over 5 separate invocations: .. code-block:: console @@ -252,3 +257,21 @@ check all repository pack files over 5 separate invocations: $ restic -r /srv/restic-repo check --read-data-subset=3/5 $ restic -r /srv/restic-repo check --read-data-subset=4/5 $ restic -r /srv/restic-repo check --read-data-subset=5/5 + +Use ``--read-data-subset=n%`` to check a randomly choosen subset of the +repository pack files. It takes one parameter, ``n``, the percentage of pack +files to check as an integer or floating point number. This will not guarantee +to cover all available pack files after sufficient runs, but it is easy to +automate checking a small subset of data after each backup. For a floating point +value the following command may be used: + +.. code-block:: console + + $ restic -r /srv/restic-repo check --read-data-subset=2.5% + +When checking bigger subsets you most likely specify the percentage as an +integer: + +.. code-block:: console + + $ restic -r /srv/restic-repo check --read-data-subset=10% From 15c537f9db0ac322b806e6d7b44a9155c74f07c6 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 15 Nov 2020 18:11:39 +0100 Subject: [PATCH 2/3] Rename changelog entry to contain issue id --- changelog/unreleased/{pull-3038 => issue-2186} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename changelog/unreleased/{pull-3038 => issue-2186} (100%) diff --git a/changelog/unreleased/pull-3038 b/changelog/unreleased/issue-2186 similarity index 100% rename from changelog/unreleased/pull-3038 rename to changelog/unreleased/issue-2186 index 496df276b..87edf9506 100644 --- a/changelog/unreleased/pull-3038 +++ b/changelog/unreleased/issue-2186 @@ -4,5 +4,5 @@ We've enhanced the command-line option --read-data-subset to also accept a percentage. This will check the given percentage of pack files which are randomly selected on each run. -https://github.com/restic/restic/pull/3038 https://github.com/restic/restic/issues/2186 +https://github.com/restic/restic/pull/3038 From caac38ed27bab3ed18f0cecb896fdcfeebc17fdd Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 15 Nov 2020 18:12:36 +0100 Subject: [PATCH 3/3] check: Tweak subset percentage over 100% error message --- cmd/restic/cmd_check.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/restic/cmd_check.go b/cmd/restic/cmd_check.go index f261a45ca..529b2d71d 100644 --- a/cmd/restic/cmd_check.go +++ b/cmd/restic/cmd_check.go @@ -83,7 +83,7 @@ func checkFlags(opts CheckOptions) error { if percentage <= 0.0 || percentage > 100.0 { return errors.Fatal( - "check flag --read-data-subset=n% n must be above 0.0% and 100.0%") + "check flag --read-data-subset=n% n must be above 0.0% and at most 100.0%") } } }