From 07133b892d5aa826b7fd7ef8aa66e973bf82a3ef Mon Sep 17 00:00:00 2001 From: Nick Craig-Wood Date: Thu, 29 Jun 2023 17:38:18 +0100 Subject: [PATCH] dirtree: fix performance with large directories of directories and --fast-list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this change if using --fast-list on a directory with more than a few thousand directories in it DirTree.CheckParents became very slow taking up to 24 hours for a directory with 1,000,000 directories in it. This is because it becomes an O(N²) operation as DirTree.Find has to search each directory in a linear fashion as it is stored as a slice. This patch fixes the problem by scanning the DirTree for directories before starting the CheckParents process so it never has to call DirTree.Find. After the fix calling DirTree.CheckParents on a directory with 1,000,000 directories in it will take about 1 second. Anything which calls DirTree.Find can potentially have bad performance so in the future we should redesign the DirTree to use a different underlying datastructure or have an index. https://forum.rclone.org/t/almost-24-hours-cpu-compute-time-during-sync-between-two-large-s3-buckets/39375/ --- fs/dirtree/dirtree.go | 55 +++++++++++++++++++++++++++++--------- fs/dirtree/dirtree_test.go | 21 ++++++++++++++- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/fs/dirtree/dirtree.go b/fs/dirtree/dirtree.go index 6b4be8b6d..70344ab34 100644 --- a/fs/dirtree/dirtree.go +++ b/fs/dirtree/dirtree.go @@ -63,10 +63,12 @@ func (dt DirTree) AddEntry(entry fs.DirEntry) { panic("unknown entry type") } remoteParent := parentDir(entry.Remote()) - dt.CheckParent("", remoteParent) + dt.checkParent("", remoteParent, nil) } // Find returns the DirEntry for filePath or nil if not found +// +// None that Find does a O(N) search so can be slow func (dt DirTree) Find(filePath string) (parentPath string, entry fs.DirEntry) { parentPath = parentDir(filePath) for _, entry := range dt[parentPath] { @@ -77,23 +79,52 @@ func (dt DirTree) Find(filePath string) (parentPath string, entry fs.DirEntry) { return parentPath, nil } -// CheckParent checks that dirPath has a *Dir in its parent -func (dt DirTree) CheckParent(root, dirPath string) { - if dirPath == root { - return +// checkParent checks that dirPath has a *Dir in its parent +// +// If dirs is not nil it must contain entries for every *Dir found in +// the tree. It is used to speed up the checking when calling this +// repeatedly. +func (dt DirTree) checkParent(root, dirPath string, dirs map[string]struct{}) { + var parentPath string + for { + if dirPath == root { + return + } + // Can rely on dirs to have all directories in it so + // we don't need to call Find. + if dirs != nil { + if _, found := dirs[dirPath]; found { + return + } + parentPath = parentDir(dirPath) + } else { + var entry fs.DirEntry + parentPath, entry = dt.Find(dirPath) + if entry != nil { + return + } + } + dt[parentPath] = append(dt[parentPath], fs.NewDir(dirPath, time.Now())) + if dirs != nil { + dirs[dirPath] = struct{}{} + } + dirPath = parentPath } - parentPath, entry := dt.Find(dirPath) - if entry != nil { - return - } - dt[parentPath] = append(dt[parentPath], fs.NewDir(dirPath, time.Now())) - dt.CheckParent(root, parentPath) } // CheckParents checks every directory in the tree has *Dir in its parent func (dt DirTree) CheckParents(root string) { + dirs := make(map[string]struct{}) + // Find all the directories and stick them in dirs + for _, entries := range dt { + for _, entry := range entries { + if _, ok := entry.(fs.Directory); ok { + dirs[entry.Remote()] = struct{}{} + } + } + } for dirPath := range dt { - dt.CheckParent(root, dirPath) + dt.checkParent(root, dirPath, dirs) } } diff --git a/fs/dirtree/dirtree_test.go b/fs/dirtree/dirtree_test.go index d548d8c6f..81930fc1a 100644 --- a/fs/dirtree/dirtree_test.go +++ b/fs/dirtree/dirtree_test.go @@ -1,6 +1,7 @@ package dirtree import ( + "fmt" "testing" "github.com/rclone/rclone/fstest/mockdir" @@ -108,7 +109,7 @@ func TestDirTreeCheckParent(t *testing.T) { sausage `, dt.String()) - dt.CheckParent("", "dir/subdir") + dt.checkParent("", "dir/subdir", nil) assert.Equal(t, `/ dir/ @@ -200,3 +201,21 @@ dir2/ `, dt.String()) } + +func BenchmarkCheckParents(b *testing.B) { + for _, N := range []int{1e2, 1e3, 1e4, 1e5, 1e6} { + b.Run(fmt.Sprintf("%d", N), func(b *testing.B) { + b.StopTimer() + dt := New() + for i := 0; i < N; i++ { + remote := fmt.Sprintf("dir%09d/file%09d.txt", i, 1) + o := mockobject.New(remote) + dt.Add(o) + } + b.StartTimer() + for n := 0; n < b.N; n++ { + dt.CheckParents("") + } + }) + } +}