Merge pull request #3522 from greatroar/dump-lru

Use LRU cache in restic dump
This commit is contained in:
MichaelEischer 2021-09-24 20:33:58 +02:00 committed by GitHub
commit a5e103a212
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 141 additions and 92 deletions

View file

@ -0,0 +1,10 @@
Enhancement: Cache blobs read by the dump command
When dumping a file using the `restic dump` command, restic did not cache blobs
in any way, so even consecutive runs of the same blob were loaded from the
repository again and again, slowing down the dump.
Now, the caching mechanism already used by the `fuse` command is also used by
the `dump` command. This makes dumping much faster, especially for sparse files.
https://github.com/restic/restic/pull/3508

View file

@ -95,7 +95,8 @@ func printFromTree(ctx context.Context, tree *restic.Tree, repo restic.Repositor
if node.Name == pathComponents[0] {
switch {
case l == 1 && dump.IsFile(node):
return dump.GetNodeData(ctx, os.Stdout, repo, node)
cache := dump.NewCache()
return dump.WriteNodeData(ctx, os.Stdout, repo, node, cache)
case l > 1 && dump.IsDir(node):
subtree, err := repo.LoadTree(ctx, *node.Subtree)
if err != nil {

View file

@ -1,4 +1,4 @@
package fuse
package bloblru
import (
"sync"
@ -10,12 +10,12 @@ import (
)
// Crude estimate of the overhead per blob: a SHA-256, a linked list node
// and some pointers. See comment in blobCache.add.
const cacheOverhead = len(restic.ID{}) + 64
// and some pointers. See comment in Cache.Add.
const overhead = len(restic.ID{}) + 64
// A blobCache is a fixed-size cache of blob contents.
// A Cache is a fixed-size LRU cache of blob contents.
// It is safe for concurrent access.
type blobCache struct {
type Cache struct {
mu sync.Mutex
c *simplelru.LRU
@ -23,16 +23,16 @@ type blobCache struct {
}
// Construct a blob cache that stores at most size bytes worth of blobs.
func newBlobCache(size int) *blobCache {
c := &blobCache{
func New(size int) *Cache {
c := &Cache{
free: size,
size: size,
}
// NewLRU wants us to specify some max. number of entries, else it errors.
// The actual maximum will be smaller than size/cacheOverhead, because we
// The actual maximum will be smaller than size/overhead, because we
// evict entries (RemoveOldest in add) to maintain our size bound.
maxEntries := size / cacheOverhead
maxEntries := size / overhead
lru, err := simplelru.NewLRU(maxEntries, c.evict)
if err != nil {
panic(err) // Can only be maxEntries <= 0.
@ -42,10 +42,12 @@ func newBlobCache(size int) *blobCache {
return c
}
func (c *blobCache) add(id restic.ID, blob []byte) {
debug.Log("blobCache: add %v", id)
// Add adds key id with value blob to c.
// It may return an evicted buffer for reuse.
func (c *Cache) Add(id restic.ID, blob []byte) (old []byte) {
debug.Log("bloblru.Cache: add %v", id)
size := len(blob) + cacheOverhead
size := len(blob) + overhead
if size > c.size {
return
}
@ -59,29 +61,36 @@ func (c *blobCache) add(id restic.ID, blob []byte) {
return
}
// This loop takes at most min(maxEntries, maxchunksize/cacheOverhead)
// This loop takes at most min(maxEntries, maxchunksize/overhead)
// iterations.
for size > c.free {
c.c.RemoveOldest()
_, val, _ := c.c.RemoveOldest()
b := val.([]byte)
if len(b) > len(old) {
// We can only return one buffer, so pick the largest.
old = b
}
}
c.c.Add(key, blob)
c.free -= size
return old
}
func (c *blobCache) get(id restic.ID) ([]byte, bool) {
func (c *Cache) Get(id restic.ID) ([]byte, bool) {
c.mu.Lock()
value, ok := c.c.Get(id)
c.mu.Unlock()
debug.Log("blobCache: get %v, hit %v", id, ok)
debug.Log("bloblru.Cache: get %v, hit %v", id, ok)
blob, ok := value.([]byte)
return blob, ok
}
func (c *blobCache) evict(key, value interface{}) {
func (c *Cache) evict(key, value interface{}) {
blob := value.([]byte)
debug.Log("blobCache: evict %v, %d bytes", key, len(blob))
c.free += len(blob) + cacheOverhead
debug.Log("bloblru.Cache: evict %v, %d bytes", key, len(blob))
c.free += len(blob) + overhead
}

View file

@ -0,0 +1,50 @@
package bloblru
import (
"testing"
"github.com/restic/restic/internal/restic"
rtest "github.com/restic/restic/internal/test"
)
// TestCache checks the size accounting of Cache: blobs that fit are stored
// and can be fetched again, the oldest blob is evicted once the byte budget
// is exceeded, a blob larger than the whole cache is not stored, and after
// removing every entry the free space equals the configured size.
func TestCache(t *testing.T) {
	var id1, id2, id3 restic.ID
	id1[0] = 1
	id2[0] = 2
	id3[0] = 3

	const (
		kiB       = 1 << 10
		cacheSize = 64*kiB + 3*overhead
	)

	c := New(cacheSize)

	// addAndCheck stores exp under id and verifies that a lookup succeeds,
	// comparing both the first-element pointers and the contents.
	addAndCheck := func(id restic.ID, exp []byte) {
		c.Add(id, exp)
		blob, ok := c.Get(id)
		rtest.Assert(t, ok, "blob %v added but not found in cache", id)
		rtest.Equals(t, &exp[0], &blob[0])
		rtest.Equals(t, exp, blob)
	}

	// 32 KiB + 30 KiB fit into the budget; the additional 10 KiB do not,
	// so the oldest entry (id1) is expected to be evicted.
	addAndCheck(id1, make([]byte, 32*kiB))
	addAndCheck(id2, make([]byte, 30*kiB))
	addAndCheck(id3, make([]byte, 10*kiB))

	_, ok := c.Get(id2)
	rtest.Assert(t, ok, "blob %v not present", id2)
	_, ok = c.Get(id1)
	rtest.Assert(t, !ok, "blob %v present, but should have been evicted", id1)

	// A blob that is bigger than the entire cache must be rejected.
	c.Add(id1, make([]byte, 1+c.size))
	_, ok = c.Get(id1)
	rtest.Assert(t, !ok, "blob %v too large but still added to cache", id1)

	// With every entry removed, the whole budget must be free again.
	c.c.Remove(id1)
	c.c.Remove(id3)
	c.c.Remove(id2)

	rtest.Equals(t, cacheSize, c.size)
	rtest.Equals(t, cacheSize, c.free)
}

View file

@ -5,6 +5,7 @@ import (
"io"
"path"
"github.com/restic/restic/internal/bloblru"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/restic"
"github.com/restic/restic/internal/walker"
@ -20,7 +21,11 @@ type dumper interface {
// It will loop over all nodes in the tree and dump them recursively.
type WriteDump func(ctx context.Context, repo restic.Repository, tree *restic.Tree, rootPath string, dst io.Writer) error
func writeDump(ctx context.Context, repo restic.Repository, tree *restic.Tree, rootPath string, dmp dumper, dst io.Writer) error {
// NewCache returns a fresh blob cache for use when dumping node data.
// The cache stores at most 64 MiB worth of blobs.
func NewCache() *bloblru.Cache {
	const cacheSize = 64 << 20 // 64 MiB
	return bloblru.New(cacheSize)
}
func writeDump(ctx context.Context, repo restic.Repository, tree *restic.Tree, rootPath string, dmp dumper) error {
for _, rootNode := range tree.Nodes {
rootNode.Path = rootPath
err := dumpTree(ctx, repo, rootNode, rootPath, dmp)
@ -71,20 +76,24 @@ func dumpTree(ctx context.Context, repo restic.Repository, rootNode *restic.Node
return err
}
// GetNodeData will write the contents of the node to the given output.
func GetNodeData(ctx context.Context, output io.Writer, repo restic.Repository, node *restic.Node) error {
// WriteNodeData writes the contents of the node to the given Writer.
func WriteNodeData(ctx context.Context, w io.Writer, repo restic.Repository, node *restic.Node, cache *bloblru.Cache) error {
var (
buf []byte
err error
)
for _, id := range node.Content {
buf, err = repo.LoadBlob(ctx, restic.DataBlob, id, buf)
if err != nil {
return err
blob, ok := cache.Get(id)
if !ok {
blob, err = repo.LoadBlob(ctx, restic.DataBlob, id, buf)
if err != nil {
return err
}
buf = cache.Add(id, blob) // Reuse evicted buffer.
}
_, err = output.Write(buf)
if err != nil {
if _, err := w.Write(blob); err != nil {
return errors.Wrap(err, "Write")
}
}

View file

@ -8,25 +8,29 @@ import (
"path/filepath"
"strings"
"github.com/restic/restic/internal/bloblru"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/restic"
)
type tarDumper struct {
w *tar.Writer
cache *bloblru.Cache
w *tar.Writer
}
// Statically ensure that tarDumper implements dumper.
var _ dumper = tarDumper{}
var _ dumper = &tarDumper{}
// WriteTar will write the contents of the given tree, encoded as a tar to the given destination.
func WriteTar(ctx context.Context, repo restic.Repository, tree *restic.Tree, rootPath string, dst io.Writer) error {
dmp := tarDumper{w: tar.NewWriter(dst)}
return writeDump(ctx, repo, tree, rootPath, dmp, dst)
dmp := &tarDumper{
cache: NewCache(),
w: tar.NewWriter(dst),
}
return writeDump(ctx, repo, tree, rootPath, dmp)
}
func (dmp tarDumper) Close() error {
func (dmp *tarDumper) Close() error {
return dmp.w.Close()
}
@ -39,7 +43,7 @@ const (
cISVTX = 0o1000 // Save text (sticky bit)
)
func (dmp tarDumper) dumpNode(ctx context.Context, node *restic.Node, repo restic.Repository) error {
func (dmp *tarDumper) dumpNode(ctx context.Context, node *restic.Node, repo restic.Repository) error {
relPath, err := filepath.Rel("/", node.Path)
if err != nil {
return err
@ -90,7 +94,7 @@ func (dmp tarDumper) dumpNode(ctx context.Context, node *restic.Node, repo resti
return errors.Wrap(err, "TarHeader")
}
return GetNodeData(ctx, dmp.w, repo, node)
return WriteNodeData(ctx, dmp.w, repo, node, dmp.cache)
}
func parseXattrs(xattrs []restic.ExtendedAttribute) map[string]string {

View file

@ -6,29 +6,33 @@ import (
"io"
"path/filepath"
"github.com/restic/restic/internal/bloblru"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/restic"
)
type zipDumper struct {
w *zip.Writer
cache *bloblru.Cache
w *zip.Writer
}
// Statically ensure that zipDumper implements dumper.
var _ dumper = zipDumper{}
var _ dumper = &zipDumper{}
// WriteZip will write the contents of the given tree, encoded as a zip to the given destination.
func WriteZip(ctx context.Context, repo restic.Repository, tree *restic.Tree, rootPath string, dst io.Writer) error {
dmp := zipDumper{w: zip.NewWriter(dst)}
return writeDump(ctx, repo, tree, rootPath, dmp, dst)
dmp := &zipDumper{
cache: NewCache(),
w: zip.NewWriter(dst),
}
return writeDump(ctx, repo, tree, rootPath, dmp)
}
func (dmp zipDumper) Close() error {
func (dmp *zipDumper) Close() error {
return dmp.w.Close()
}
func (dmp zipDumper) dumpNode(ctx context.Context, node *restic.Node, repo restic.Repository) error {
func (dmp *zipDumper) dumpNode(ctx context.Context, node *restic.Node, repo restic.Repository) error {
relPath, err := filepath.Rel("/", node.Path)
if err != nil {
return err
@ -58,5 +62,5 @@ func (dmp zipDumper) dumpNode(ctx context.Context, node *restic.Node, repo resti
return nil
}
return GetNodeData(ctx, w, repo, node)
return WriteNodeData(ctx, w, repo, node, dmp.cache)
}

View file

@ -96,7 +96,7 @@ func (f *file) Open(ctx context.Context, req *fuse.OpenRequest, resp *fuse.OpenR
func (f *openFile) getBlobAt(ctx context.Context, i int) (blob []byte, err error) {
blob, ok := f.root.blobCache.get(f.node.Content[i])
blob, ok := f.root.blobCache.Get(f.node.Content[i])
if ok {
return blob, nil
}
@ -107,7 +107,7 @@ func (f *openFile) getBlobAt(ctx context.Context, i int) (blob []byte, err error
return nil, err
}
f.root.blobCache.add(f.node.Content[i], blob)
f.root.blobCache.Add(f.node.Content[i], blob)
return blob, nil
}

View file

@ -1,3 +1,4 @@
//go:build darwin || freebsd || linux
// +build darwin freebsd linux
package fuse
@ -10,6 +11,7 @@ import (
"testing"
"time"
"github.com/restic/restic/internal/bloblru"
"github.com/restic/restic/internal/repository"
"github.com/restic/restic/internal/restic"
@ -19,48 +21,6 @@ import (
rtest "github.com/restic/restic/internal/test"
)
// TestCache checks the size accounting of blobCache: blobs that fit are
// stored and can be fetched again, the oldest blob is evicted once the byte
// budget is exceeded, a blob larger than the whole cache is not stored, and
// after removing every entry the free space equals the configured size.
func TestCache(t *testing.T) {
	var id1, id2, id3 restic.ID
	id1[0] = 1
	id2[0] = 2
	id3[0] = 3

	const (
		kiB       = 1 << 10
		cacheSize = 64*kiB + 3*cacheOverhead
	)

	c := newBlobCache(cacheSize)

	// addAndCheck stores exp under id and verifies that a lookup succeeds,
	// comparing both the first-element pointers and the contents.
	addAndCheck := func(id restic.ID, exp []byte) {
		c.add(id, exp)
		blob, ok := c.get(id)
		rtest.Assert(t, ok, "blob %v added but not found in cache", id)
		rtest.Equals(t, &exp[0], &blob[0])
		rtest.Equals(t, exp, blob)
	}

	// 32 KiB + 30 KiB fit into the budget; the additional 10 KiB do not,
	// so the oldest entry (id1) is expected to be evicted.
	addAndCheck(id1, make([]byte, 32*kiB))
	addAndCheck(id2, make([]byte, 30*kiB))
	addAndCheck(id3, make([]byte, 10*kiB))

	_, ok := c.get(id2)
	rtest.Assert(t, ok, "blob %v not present", id2)
	_, ok = c.get(id1)
	rtest.Assert(t, !ok, "blob %v present, but should have been evicted", id1)

	// A blob that is bigger than the entire cache must be rejected.
	c.add(id1, make([]byte, 1+c.size))
	_, ok = c.get(id1)
	rtest.Assert(t, !ok, "blob %v too large but still added to cache", id1)

	// With every entry removed, the whole budget must be free again.
	c.c.Remove(id1)
	c.c.Remove(id3)
	c.c.Remove(id2)

	rtest.Equals(t, cacheSize, c.size)
	rtest.Equals(t, cacheSize, c.free)
}
func testRead(t testing.TB, f fs.Handle, offset, length int, data []byte) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
@ -156,7 +116,7 @@ func TestFuseFile(t *testing.T) {
Size: filesize,
Content: content,
}
root := &Root{repo: repo, blobCache: newBlobCache(blobCacheSize)}
root := &Root{repo: repo, blobCache: bloblru.New(blobCacheSize)}
inode := fs.GenerateDynamicInode(1, "foo")
f, err := newFile(context.TODO(), root, inode, node)
@ -191,7 +151,7 @@ func TestFuseDir(t *testing.T) {
repo, cleanup := repository.TestRepository(t)
defer cleanup()
root := &Root{repo: repo, blobCache: newBlobCache(blobCacheSize)}
root := &Root{repo: repo, blobCache: bloblru.New(blobCacheSize)}
node := &restic.Node{
Mode: 0755,

View file

@ -1,3 +1,4 @@
//go:build darwin || freebsd || linux
// +build darwin freebsd linux
package fuse
@ -6,6 +7,7 @@ import (
"os"
"time"
"github.com/restic/restic/internal/bloblru"
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/restic"
@ -27,7 +29,7 @@ type Root struct {
cfg Config
inode uint64
snapshots restic.Snapshots
blobCache *blobCache
blobCache *bloblru.Cache
snCount int
lastCheck time.Time
@ -54,7 +56,7 @@ func NewRoot(repo restic.Repository, cfg Config) *Root {
repo: repo,
inode: rootInode,
cfg: cfg,
blobCache: newBlobCache(blobCacheSize),
blobCache: bloblru.New(blobCacheSize),
}
if !cfg.OwnerIsRoot {