Implement directory include filtering for efficiency

Fixes #395
This commit is contained in:
Nick Craig-Wood 2016-05-16 17:14:04 +01:00
parent 536526cc92
commit ad85f6e413
5 changed files with 271 additions and 128 deletions

View file

@ -81,13 +81,32 @@ Special characters can be escaped with a `\` before them.
\*.jpg - matches "*.jpg"
\\.jpg - matches "\.jpg"
\[one\].jpg - matches "[one].jpg"
### Directories ###
Rclone keeps track of directories that could match any file patterns.
Eg if you add the include rule
/a/*.jpg
Rclone will synthesize the directory include rule
/a/
If you put any rules which end in `/` then it will only match
directories.
Directory matches are **only** used to optimise directory access
patterns - you must still match the files that you want to match.
Directory matches won't optimise anything on bucket based remotes (eg
s3, swift, google cloud storage, b2) which don't have a concept of
directory.
### Differences between rsync and rclone patterns ###
Rclone implements bash style `{a,b,c}` glob matching which rsync doesn't.
Rclone ignores `/` at the end of a pattern.
Rclone always does a wildcard match so `\` must always escape a `\`.
## How the rules are used ##
@ -120,6 +139,11 @@ This would exclude
* `secret17.jpg`
* non `*.jpg` and `*.png`
A similar process is done on directory entries before recursing into
them. This only works on remotes which have a concept of directory
(eg local, drive, onedrive, amazon cloud drive) and not on bucket
based remotes (eg s3, swift, google cloud storage, b2).
## Adding filtering rules ##
Filtering rules are added with the following command line flags.

View file

@ -59,6 +59,40 @@ func (r *rule) String() string {
return fmt.Sprintf("%s %s", c, r.Regexp.String())
}
// rules is a slice of rules
//
// Alongside the ordered slice it keeps a set of the rules already
// present so that add can skip duplicates.
type rules struct {
// rules holds the rules in the order they were added
rules []rule
// existing is keyed by the String() form of each rule held in rules;
// it is created lazily by add and reset to nil by clear
existing map[string]struct{}
}
// add appends a rule made from Include and re, unless a rule with the
// same String() form has already been recorded, in which case it does
// nothing.
func (rs *rules) add(Include bool, re *regexp.Regexp) {
	candidate := rule{Include: Include, Regexp: re}
	key := candidate.String()
	// Looking up a key in a nil map is safe and simply reports "absent"
	if _, seen := rs.existing[key]; seen {
		return
	}
	// The set is created on first use so the zero value of rules works
	if rs.existing == nil {
		rs.existing = make(map[string]struct{})
	}
	rs.existing[key] = struct{}{}
	rs.rules = append(rs.rules, candidate)
}
// clear drops every stored rule and forgets which rules have been seen
func (rs *rules) clear() {
	rs.rules, rs.existing = nil, nil
}
// len reports how many rules are currently held
func (rs *rules) len() int {
	count := len(rs.rules)
	return count
}
// filesMap describes the map of files to transfer
//
// It is used as a set: only the presence of a key matters, the empty
// struct values carry no information.
type filesMap map[string]struct{}
@ -69,7 +103,8 @@ type Filter struct {
MaxSize int64
ModTimeFrom time.Time
ModTimeTo time.Time
rules []rule
fileRules rules
dirRules rules
files filesMap // files if filesFrom
dirs filesMap // dirs from filesFrom
}
@ -172,7 +207,7 @@ func NewFilter() (f *Filter, err error) {
}
}
if addImplicitExclude {
err = f.Add(false, "*")
err = f.Add(false, "/**")
if err != nil {
return nil, err
}
@ -204,17 +239,49 @@ func NewFilter() (f *Filter, err error) {
return f, nil
}
// addDirGlobs derives the directory globs implied by the file glob
// passed in and registers each one as a directory rule with the given
// include/exclude status.
//
// It returns the first error from converting a directory glob to a
// regexp, or nil if all of them were added.
func (f *Filter) addDirGlobs(Include bool, glob string) error {
	candidates := globToDirGlobs(glob)
	for i := range candidates {
		// The root is always included, so "/" needs no rule of its own
		if candidates[i] != "/" {
			compiled, err := globToRegexp(candidates[i])
			if err != nil {
				return err
			}
			f.dirRules.add(Include, compiled)
		}
	}
	return nil
}
// Add adds a filter rule with include or exclude status indicated
//
// Include is true for an include rule and false for an exclude rule.
// glob is converted with globToRegexp and any conversion error is
// returned, otherwise nil is returned.
//
// A glob ending in "/" is treated as a directory rule, any other glob
// as a file rule, and a glob ending in "**" as both.
func (f *Filter) Add(Include bool, glob string) error {
isDirRule := strings.HasSuffix(glob, "/")
isFileRule := !isDirRule
// A glob ending in ** is registered as both a file and a directory rule
if strings.HasSuffix(glob, "**") {
isDirRule, isFileRule = true, true
}
re, err := globToRegexp(glob)
if err != nil {
return err
}
// NOTE(review): the rule literal below and the f.rules append near the
// end look like the pre-change implementation shown next to its
// replacement - confirm which lines belong to the current code.
rule := rule{
Include: Include,
Regexp: re,
if isFileRule {
f.fileRules.add(Include, re)
// If include rule work out what directories are needed to scan
// if exclude rule, we can't rule anything out
// Unless it is `*` which matches everything
// NB ** and /** are DirRules
if Include || glob == "*" {
// Synthesize directory rules from the file glob
err = f.addDirGlobs(Include, glob)
if err != nil {
return err
}
}
}
if isDirRule {
// Directory rules use the same regexp as the glob itself
f.dirRules.add(Include, re)
}
f.rules = append(f.rules, rule)
return nil
}
@ -266,7 +333,8 @@ func (f *Filter) AddFile(file string) error {
// Clear clears all the filter rules
//
// Both the file rules and the directory rules are emptied.
// NOTE(review): f.rules is reset here as well although fileRules and
// dirRules appear to replace it - confirm whether that line is current.
func (f *Filter) Clear() {
f.rules = nil
f.fileRules.clear()
f.dirRules.clear()
}
// InActive returns false if any filters are active
@ -276,12 +344,13 @@ func (f *Filter) InActive() bool {
f.ModTimeTo.IsZero() &&
f.MinSize == 0 &&
f.MaxSize == 0 &&
len(f.rules) == 0)
f.fileRules.len() == 0 &&
f.dirRules.len() == 0)
}
// includeRemote returns whether this remote passes the filter rules.
func (f *Filter) includeRemote(remote string) bool {
for _, rule := range f.rules {
for _, rule := range f.fileRules.rules {
if rule.Match(remote) {
return rule.Include
}
@ -298,7 +367,13 @@ func (f *Filter) IncludeDirectory(remote string) bool {
_, include := f.dirs[remote]
return include
}
return f.includeRemote(remote + "/")
remote += "/"
for _, rule := range f.dirRules.rules {
if rule.Match(remote) {
return rule.Include
}
}
return true
}
// Include returns whether this object should be included into the
@ -372,8 +447,13 @@ func (f *Filter) DumpFilters() string {
if !f.ModTimeTo.IsZero() {
rules = append(rules, fmt.Sprintf("Last-modified date must be equal or less than: %s", f.ModTimeTo.String()))
}
for _, rule := range f.rules {
rules = append(rules, "--- File filter rules ---")
for _, rule := range f.fileRules.rules {
rules = append(rules, rule.String())
}
rules = append(rules, "--- Directory filter rules ---")
for _, dirRule := range f.dirRules.rules {
rules = append(rules, dirRule.String())
}
return strings.Join(rules, "\n")
}

View file

@ -8,6 +8,7 @@ import (
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestAgeSuffix(t *testing.T) {
@ -46,27 +47,14 @@ func TestAgeSuffix(t *testing.T) {
// TestNewFilterDefault checks that a Filter made by NewFilter starts
// with DeleteExcluded false, zero size limits, no rules and no files,
// and reports itself as InActive.
//
// NOTE(review): the manual if/t.Errorf checks and the require/assert
// calls below verify the same things; they look like the old and new
// forms of this test shown together - confirm which set is current.
func TestNewFilterDefault(t *testing.T) {
f, err := NewFilter()
if err != nil {
t.Fatal(err)
}
if f.DeleteExcluded != false {
t.Errorf("DeleteExcluded want false got %v", f.DeleteExcluded)
}
if f.MinSize != 0 {
t.Errorf("MinSize want 0 got %v", f.MinSize)
}
if f.MaxSize != 0 {
t.Errorf("MaxSize want 0 got %v", f.MaxSize)
}
if len(f.rules) != 0 {
t.Errorf("rules want non got %v", f.rules)
}
if f.files != nil {
t.Errorf("files want none got %v", f.files)
}
if !f.InActive() {
t.Errorf("want InActive")
}
// Stop immediately if the filter could not be created
require.NoError(t, err)
assert.False(t, f.DeleteExcluded)
assert.Equal(t, int64(0), f.MinSize)
assert.Equal(t, int64(0), f.MaxSize)
// Neither file rules nor directory rules should be present
assert.Len(t, f.fileRules.rules, 0)
assert.Len(t, f.dirRules.rules, 0)
assert.Nil(t, f.files)
assert.True(t, f.InActive())
}
// return a pointer to the string
@ -77,9 +65,7 @@ func stringP(s string) *string {
// testFile creates a temp file with the contents
func testFile(t *testing.T, contents string) *string {
out, err := ioutil.TempFile("", "filter_test")
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
defer func() {
err := out.Close()
if err != nil {
@ -87,9 +73,7 @@ func testFile(t *testing.T, contents string) *string {
}
}()
_, err = out.Write([]byte(contents))
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
s := out.Name()
return &s
}
@ -138,20 +122,13 @@ func TestNewFilterFull(t *testing.T) {
}()
f, err := NewFilter()
if err != nil {
t.Fatal(err)
}
if f.DeleteExcluded != true {
t.Errorf("DeleteExcluded want true got %v", f.DeleteExcluded)
}
if f.MinSize != mins {
t.Errorf("MinSize want %v got %v", mins, f.MinSize)
}
if f.MaxSize != maxs {
t.Errorf("MaxSize want %v got %v", maxs, f.MaxSize)
}
require.NoError(t, err)
assert.True(t, f.DeleteExcluded)
assert.Equal(t, f.MinSize, mins)
assert.Equal(t, f.MaxSize, maxs)
got := f.DumpFilters()
want := `+ (^|/)include1$
want := `--- File filter rules ---
+ (^|/)include1$
+ (^|/)include2$
+ (^|/)include3$
- (^|/)exclude1$
@ -160,22 +137,19 @@ func TestNewFilterFull(t *testing.T) {
- (^|/)filter1$
+ (^|/)filter2$
- (^|/)filter3$
- (^|/)[^/]*$`
if got != want {
t.Errorf("rules want %s got %s", want, got)
}
if len(f.files) != 2 {
t.Errorf("files want 2 got %v", f.files)
}
- ^.*$
--- Directory filter rules ---
+ ^.*$
- ^.*$`
assert.Equal(t, want, got)
assert.Len(t, f.files, 2)
for _, name := range []string{"files1", "files2"} {
_, ok := f.files[name]
if !ok {
t.Errorf("Didn't find file %q in f.files", name)
}
}
if f.InActive() {
t.Errorf("want !InActive")
}
assert.False(t, f.InActive())
}
type includeTest struct {
@ -188,9 +162,7 @@ type includeTest struct {
// testInclude runs f.Include on every entry of tests, passing the
// entry's name, size and modification time (modTime as Unix seconds),
// and reports a failure when the result differs from the entry's want.
//
// NOTE(review): the if/t.Errorf check and the assert.Equal call test
// the same condition; they look like the old and new forms shown
// together - confirm which is current.
func testInclude(t *testing.T, f *Filter, tests []includeTest) {
for _, test := range tests {
got := f.Include(test.in, test.size, time.Unix(test.modTime, 0))
if test.want != got {
t.Errorf("%q,%d,%d: want %v got %v", test.in, test.size, test.modTime, test.want, got)
}
// The trailing arguments are presumably there to identify the failing case
assert.Equal(t, test.want, got, test.in, test.size, test.modTime)
}
}
@ -202,17 +174,13 @@ type includeDirTest struct {
// testDirInclude runs f.IncludeDirectory on the name of every entry of
// tests and reports a failure when the result differs from the entry's
// want.
//
// NOTE(review): the if/t.Errorf check and the assert.Equal call test
// the same condition; they look like the old and new forms shown
// together - confirm which is current.
func testDirInclude(t *testing.T, f *Filter, tests []includeDirTest) {
for _, test := range tests {
got := f.IncludeDirectory(test.in)
if test.want != got {
t.Errorf("%q: want %v got %v", test.in, test.want, got)
}
// The trailing argument is presumably there to identify the failing case
assert.Equal(t, test.want, got, test.in)
}
}
func TestNewFilterIncludeFiles(t *testing.T) {
f, err := NewFilter()
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
err = f.AddFile("file1.jpg")
if err != nil {
t.Error(err)
@ -239,9 +207,7 @@ func TestNewFilterIncludeFiles(t *testing.T) {
func TestNewFilterIncludeFilesDirs(t *testing.T) {
f, err := NewFilter()
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
for _, path := range []string{
"path/to/dir/file1.png",
"/path/to/dir/file2.png",
@ -275,9 +241,7 @@ func TestNewFilterIncludeFilesDirs(t *testing.T) {
func TestNewFilterMinSize(t *testing.T) {
f, err := NewFilter()
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
f.MinSize = 100
testInclude(t, f, []includeTest{
{"file1.jpg", 100, 0, true},
@ -291,9 +255,7 @@ func TestNewFilterMinSize(t *testing.T) {
func TestNewFilterMaxSize(t *testing.T) {
f, err := NewFilter()
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
f.MaxSize = 100
testInclude(t, f, []includeTest{
{"file1.jpg", 100, 0, true},
@ -307,9 +269,7 @@ func TestNewFilterMaxSize(t *testing.T) {
func TestNewFilterMinAndMaxAge(t *testing.T) {
f, err := NewFilter()
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
f.ModTimeFrom = time.Unix(1440000002, 0)
f.ModTimeTo = time.Unix(1440000003, 0)
testInclude(t, f, []includeTest{
@ -326,9 +286,7 @@ func TestNewFilterMinAndMaxAge(t *testing.T) {
func TestNewFilterMinAge(t *testing.T) {
f, err := NewFilter()
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
f.ModTimeTo = time.Unix(1440000002, 0)
testInclude(t, f, []includeTest{
{"file1.jpg", 100, 1440000000, true},
@ -344,9 +302,7 @@ func TestNewFilterMinAge(t *testing.T) {
func TestNewFilterMaxAge(t *testing.T) {
f, err := NewFilter()
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
f.ModTimeFrom = time.Unix(1440000002, 0)
testInclude(t, f, []includeTest{
{"file1.jpg", 100, 1440000000, false},
@ -362,25 +318,22 @@ func TestNewFilterMaxAge(t *testing.T) {
func TestNewFilterMatches(t *testing.T) {
f, err := NewFilter()
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
add := func(s string) {
err := f.AddRule(s)
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
}
add("+ cleared")
add("!")
add("- file1.jpg")
add("+ file2.png")
add("+ *.jpg")
add("- *.png")
add("- /file1.jpg")
add("+ /file2.png")
add("+ /*.jpg")
add("- /*.png")
add("- /potato")
add("+ /sausage1")
add("+ /sausage2*")
add("+ /sausage3**")
add("+ /a/*.jpg")
add("- *")
testInclude(t, f, []includeTest{
{"cleared", 100, 0, false},
@ -395,8 +348,11 @@ func TestNewFilterMatches(t *testing.T) {
{"sausage2potato", 101, 0, true},
{"sausage2/potato", 101, 0, false},
{"sausage3/potato", 101, 0, true},
{"a/one.jpg", 101, 0, true},
{"a/one.png", 101, 0, false},
{"unicorn", 99, 0, false},
})
t.Log(f.DumpFilters())
testDirInclude(t, f, []includeDirTest{
{"sausage1", false},
{"sausage2", false},
@ -406,6 +362,7 @@ func TestNewFilterMatches(t *testing.T) {
{"sausage3/sub", true},
{"sausage3/sub/dir", true},
{"sausage4", false},
{"a", true},
})
if f.InActive() {
t.Errorf("want !InActive")
@ -480,17 +437,11 @@ func TestFilterMatchesFromDocs(t *testing.T) {
{"\\[one\\].jpg", true, "[one].jpg"},
} {
f, err := NewFilter()
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
err = f.Add(true, test.glob)
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
err = f.Add(false, "*")
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
included := f.Include(test.file, 0, time.Unix(0, 0))
if included != test.included {
t.Logf("%q match %q: want %v got %v", test.glob, test.file, test.included, included)

View file

@ -115,3 +115,51 @@ func globToRegexp(glob string) (*regexp.Regexp, error) {
}
return result, nil
}
var (
	// tooHardRe finds a {} group which contains a / or a ** - such
	// globs can't be split into directory globs
	tooHardRe = regexp.MustCompile(`{[^{}]*(\*\*|/)[^{}]*}`)

	// squashSlash finds runs of two or more / so they can be
	// collapsed into one
	squashSlash = regexp.MustCompile(`/{2,}`)
)

// globToDirGlobs converts a file glob into the list of directory globs
// which could contain a match for it, longest first. Each returned
// glob is meant to be matched against a directory name with a trailing
// /.
//
// If the glob can't be analysed (a / or ** inside {}), or it contains
// neither / nor **, the single catch-all glob "/**" is returned.
func globToDirGlobs(glob string) (out []string) {
	const anyDirectory = "/**"

	if tooHardRe.MatchString(glob) {
		// Can't work this one out, so say any directory might match
		return []string{anyDirectory}
	}

	// Collapse repeated slashes before splitting
	remaining := squashSlash.ReplaceAllString(glob, "/")

	// Repeatedly chop at the last / or ** (a ** can itself span a /)
	for {
		cut := strings.LastIndex(remaining, "/")
		tail := "/"
		if star := strings.LastIndex(remaining, "**"); star > cut {
			// Keep the ** in the directory glob since it may match
			// further path elements
			cut, tail = star, "**/"
		}
		if cut < 0 {
			break
		}
		remaining = remaining[:cut]
		dirGlob := remaining + tail
		// Chopping at a ** and then at the / before it can yield the
		// same glob twice in a row - only keep one
		if n := len(out); n == 0 || out[n-1] != dirGlob {
			out = append(out, dirGlob)
		}
	}

	// Nothing to split on means the file could be in any directory
	if len(out) == 0 {
		out = append(out, anyDirectory)
	}
	return out
}

View file

@ -1,8 +1,10 @@
package fs
import (
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestGlobToRegexp(t *testing.T) {
@ -41,24 +43,62 @@ func TestGlobToRegexp(t *testing.T) {
} {
gotRe, err := globToRegexp(test.in)
if test.error == "" {
if err != nil {
t.Errorf("%q: not expecting error: %v", test.in, err)
} else {
got := gotRe.String()
if test.want != got {
t.Errorf("%q: want %q got %q", test.in, test.want, got)
}
}
got := gotRe.String()
require.NoError(t, err, test.in)
assert.Equal(t, test.want, got, test.in)
} else {
if err == nil {
t.Errorf("%q: expecting error but didn't get one", test.in)
} else {
got := err.Error()
if !strings.Contains(got, test.error) {
t.Errorf("%q: want error %q got %q", test.in, test.error, got)
}
}
require.Error(t, err, test.in)
assert.Contains(t, err.Error(), test.error, test.in)
assert.Nil(t, gotRe)
}
}
}
// TestGlobToDirGlobs checks the directory globs produced for a range
// of file globs, and that every input is itself accepted by
// globToRegexp.
func TestGlobToDirGlobs(t *testing.T) {
	type dirGlobCase struct {
		in   string
		want []string
	}
	cases := []dirGlobCase{
		{`*`, []string{"/**"}},
		{`/*`, []string{"/"}},
		{`*.jpg`, []string{"/**"}},
		{`/*.jpg`, []string{"/"}},
		{`//*.jpg`, []string{"/"}},
		{`///*.jpg`, []string{"/"}},
		{`/a/*.jpg`, []string{"/a/", "/"}},
		{`/a//*.jpg`, []string{"/a/", "/"}},
		{`/a///*.jpg`, []string{"/a/", "/"}},
		{`/a/b/*.jpg`, []string{"/a/b/", "/a/", "/"}},
		{`a/*.jpg`, []string{"a/"}},
		{`a/b/*.jpg`, []string{"a/b/", "a/"}},
		{`*/*/*.jpg`, []string{"*/*/", "*/"}},
		{`a/b/`, []string{"a/b/", "a/"}},
		{`a/b`, []string{"a/"}},
		{`a/b/*.{jpg,png,gif}`, []string{"a/b/", "a/"}},
		{`/a/{jpg,png,gif}/*.{jpg,png,gif}`, []string{"/a/{jpg,png,gif}/", "/a/", "/"}},
		{`a/{a,a*b,a**c}/d/`, []string{"/**"}},
		{`/a/{a,a*b,a/c,d}/d/`, []string{"/**"}},
		{`**`, []string{"**/"}},
		{`a**`, []string{"a**/"}},
		{`a**b`, []string{"a**/"}},
		{`a**b**c**d`, []string{"a**b**c**/", "a**b**/", "a**/"}},
		{`a**b/c**d`, []string{"a**b/c**/", "a**b/", "a**/"}},
		{`/A/a**b/B/c**d/C/`, []string{"/A/a**b/B/c**d/C/", "/A/a**b/B/c**d/", "/A/a**b/B/c**/", "/A/a**b/B/", "/A/a**b/", "/A/a**/", "/A/", "/"}},
		{`/var/spool/**/ncw`, []string{"/var/spool/**/", "/var/spool/", "/var/", "/"}},
		{`var/spool/**/ncw/`, []string{"var/spool/**/ncw/", "var/spool/**/", "var/spool/", "var/"}},
		{"/file1.jpg", []string{`/`}},
		{"/file2.png", []string{`/`}},
		{"/*.jpg", []string{`/`}},
		{"/*.png", []string{`/`}},
		{"/potato", []string{`/`}},
		{"/sausage1", []string{`/`}},
		{"/sausage2*", []string{`/`}},
		{"/sausage3**", []string{`/sausage3**/`, "/"}},
		{"/a/*.jpg", []string{`/a/`, "/"}},
	}
	for i := range cases {
		tc := cases[i]
		// Every input must be a glob that converts cleanly
		_, convertErr := globToRegexp(tc.in)
		assert.NoError(t, convertErr)
		dirGlobs := globToDirGlobs(tc.in)
		assert.Equal(t, tc.want, dirGlobs, tc.in)
	}
}