diff --git a/docs/content/filtering.md b/docs/content/filtering.md index cc6f2e426..9ac37256d 100644 --- a/docs/content/filtering.md +++ b/docs/content/filtering.md @@ -33,6 +33,9 @@ you expect. Instead use a `--filter...` flag. ### Pattern syntax +Here is a formal definition of the pattern syntax, +[examples](#examples) are below. + Rclone matching rules follow a glob style: * matches any sequence of non-separator (/) characters @@ -42,8 +45,10 @@ Rclone matching rules follow a glob style: character class (must be non-empty) { pattern-list } pattern alternatives + {{ regexp }} + regular expression to match c matches character c (c != *, **, ?, \, [, {, }) - \c matches reserved character c (c = *, **, ?, \, [, {, }) + \c matches reserved character c (c = *, **, ?, \, [, {, }) or character class character-range: @@ -62,6 +67,10 @@ character classes (see [Go regular expression reference](https://golang.org/pkg/ Perl character classes (e.g. \s, \S, \w, \W) ASCII character classes (e.g. [[:alnum:]], [[:alpha:]], [[:punct:]], [[:xdigit:]]) +regexp for advanced users to insert a regular expression - see [below](#regexp) for more info: + + Any re2 regular expression not containing `}}` + If the filter pattern starts with a `/` then it only matches at the top level of the directory tree, **relative to the root of the remote** (not necessarily the root @@ -111,6 +120,75 @@ With `--ignore-case` potato - matches "potato" - matches "POTATO" +## Using regular expressions in filter patterns {#regexp} + +The syntax of filter patterns is glob style matching (like `bash` +uses) to make things easy for users. However this does not provide +absolute control over the matching, so for advanced users rclone also +provides a regular expression syntax. + +The regular expressions used are as defined in the [Go regular +expression reference](https://golang.org/pkg/regexp/syntax/). Regular +expressions should be enclosed in `{{` `}}`. 
They will match only the +last path segment if the glob doesn't start with `/` or the whole path +name if it does. + +Here is how the `{{regexp}}` is transformed into a full regular +expression to match the entire path: + + {{regexp}} becomes (^|/)(regexp)$ + /{{regexp}} becomes ^(regexp)$ + +Regexp syntax can be mixed with glob syntax, for example + + *.{{jpe?g}} to match file.jpg, file.jpeg but not file.png + +You can also use regexp flags - to set case insensitive, for example + + *.{{(?i)jpg}} to match file.jpg, file.JPG but not file.png + +Be careful with wildcards in regular expressions - you don't want them +to match path separators normally. To match any file name starting +with `start` and ending with `end.jpg` write + + {{start[^/]*end\.jpg}} + +Not + + {{start.*end\.jpg}} + +Which will match a directory called `start` with a file called +`end.jpg` in it as the `.*` will match `/` characters. + +Note that you can use `-vv --dump filters` to show the filter patterns +in regexp format - rclone implements the glob patterns by transforming +them into regular expressions. 
+ +## Filter pattern examples {#examples} + +| Description | Pattern | Matches | Does not match | +| ----------- |-------- | ------- | -------------- | +| Wildcard | `*.jpg` | `/file.jpg` | `/file.png` | +| | | `/dir/file.jpg` | `/dir/file.png` | +| Rooted | `/*.jpg` | `/file.jpg` | `/file.png` | +| | | `/file2.jpg` | `/dir/file.jpg` | +| Alternates | `*.{jpg,png}` | `/file.jpg` | `/file.gif` | +| | | `/dir/file.png` | `/dir/file.gif` | +| Path Wildcard | `dir/**` | `/dir/anyfile` | `/file.png` | +| | | `/subdir/dir/subsubdir/anyfile` | `/subdir/file.png` | +| Any Char | `*.t?t` | `/file.txt` | `/file.qxt` | +| | | `/dir/file.tzt` | `/dir/file.png` | +| Range | `*.[a-z]` | `/file.a` | `/file.0` | +| | | `/dir/file.b` | `/dir/file.1` | +| Escape | `*.\?\?\?` | `/file.???` | `/file.abc` | +| | | `/dir/file.???` | `/dir/file.def` | +| Class | `*.\d\d\d` | `/file.012` | `/file.abc` | +| | | `/dir/file.345` | `/dir/file.def` | +| Regexp | `*.{{jpe?g}}` | `/file.jpeg` | `/file.png` | +| | | `/dir/file.jpg` | `/dir/file.jpeeg` | +| Rooted Regexp | `/{{[^/]*\.jpe?g}}` | `/file.jpeg` | `/file.png` | +| | | `/file.jpg` | `/dir/file.jpg` | + ## How filter rules are applied to files Rclone path/file name filters are made up of one or more of the following flags: diff --git a/fs/filter/filter_test.go b/fs/filter/filter_test.go index 3d5439fe2..81a01af3d 100644 --- a/fs/filter/filter_test.go +++ b/fs/filter/filter_test.go @@ -503,6 +503,31 @@ func TestNewFilterMatchesIgnoreCase(t *testing.T) { assert.False(t, f.InActive()) } +func TestNewFilterMatchesRegexp(t *testing.T) { + f, err := NewFilter(nil) + require.NoError(t, err) + add := func(s string) { + err := f.AddRule(s) + require.NoError(t, err) + } + add(`+ /{{file\d+\.png}}`) + add(`+ *.{{(?i)jpg}}`) + add(`- *`) + testInclude(t, f, []includeTest{ + {"file2.png", 100, 0, true}, + {"sub/file2.png", 100, 0, false}, + {"file123.png", 100, 0, true}, + {"File123.png", 100, 0, false}, + {"something.jpg", 100, 0, true}, + 
{"deep/path/something.JPG", 100, 0, true}, + {"something.gif", 100, 0, false}, + }) + testDirInclude(t, f, []includeDirTest{ + {"anything at all", true}, + }) + assert.False(t, f.InActive()) +} + func TestFilterAddDirRuleOrFileRule(t *testing.T) { for _, test := range []struct { included bool diff --git a/fs/filter/glob.go b/fs/filter/glob.go index fdaf96bdb..350180d3f 100644 --- a/fs/filter/glob.go +++ b/fs/filter/glob.go @@ -19,7 +19,7 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) { } if strings.HasPrefix(glob, "/") { glob = glob[1:] - _, _ = re.WriteRune('^') + _ = re.WriteByte('^') } else { _, _ = re.WriteString("(^|/)") } @@ -38,15 +38,45 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) { consecutiveStars = 0 return nil } + overwriteLastChar := func(c byte) { + buf := re.Bytes() + buf[len(buf)-1] = c + } inBraces := false inBrackets := 0 slashed := false + inRegexp := false // inside {{ ... }} + inRegexpEnd := false // have received }} waiting for more + var next, last rune for _, c := range glob { + next, last = c, next if slashed { _, _ = re.WriteRune(c) slashed = false continue } + if inRegexpEnd { + if c == '}' { + // Regexp is ending with }} choose longest segment + // Replace final ) with } + overwriteLastChar('}') + _ = re.WriteByte(')') + continue + } else { + inRegexpEnd = false + } + } + if inRegexp { + if c == '}' && last == '}' { + inRegexp = false + inRegexpEnd = true + // Replace final } with ) + overwriteLastChar(')') + } else { + _, _ = re.WriteRune(c) + } + continue + } if c != '*' { err := insertStars() if err != nil { @@ -78,24 +108,30 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) { return nil, fmt.Errorf("mismatched ']' in glob %q", glob) case '{': if inBraces { - return nil, fmt.Errorf("can't nest '{' '}' in glob %q", glob) + if last == '{' { + inRegexp = true + inBraces = false + } else { + return nil, fmt.Errorf("can't nest '{' '}' in glob %q", glob) + } 
+ } else { + inBraces = true + _ = re.WriteByte('(') } - inBraces = true - _, _ = re.WriteRune('(') case '}': if !inBraces { return nil, fmt.Errorf("mismatched '{' and '}' in glob %q", glob) } - _, _ = re.WriteRune(')') + _ = re.WriteByte(')') inBraces = false case ',': if inBraces { - _, _ = re.WriteRune('|') + _ = re.WriteByte('|') } else { _, _ = re.WriteRune(c) } case '.', '+', '(', ')', '|', '^', '$': // regexp meta characters not dealt with above - _, _ = re.WriteRune('\\') + _ = re.WriteByte('\\') _, _ = re.WriteRune(c) default: _, _ = re.WriteRune(c) @@ -111,7 +147,10 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) { if inBraces { return nil, fmt.Errorf("mismatched '{' and '}' in glob %q", glob) } - _, _ = re.WriteRune('$') + if inRegexp { + return nil, fmt.Errorf("mismatched '{{' and '}}' in glob %q", glob) + } + _ = re.WriteByte('$') result, err := regexp.Compile(re.String()) if err != nil { return nil, fmt.Errorf("bad glob pattern %q (regexp %q): %w", glob, re.String(), err) @@ -120,8 +159,10 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) { } var ( - // Can't deal with / or ** in {} - tooHardRe = regexp.MustCompile(`{[^{}]*(\*\*|/)[^{}]*}`) + // Can't deal with + // / or ** in {} + // {{ regexp }} + tooHardRe = regexp.MustCompile(`({[^{}]*(\*\*|/)[^{}]*})|\{\{|\}\}`) // Squash all / squashSlash = regexp.MustCompile(`/{2,}`) diff --git a/fs/filter/glob_test.go b/fs/filter/glob_test.go index 351f93321..5946a5a0a 100644 --- a/fs/filter/glob_test.go +++ b/fs/filter/glob_test.go @@ -32,7 +32,7 @@ func TestGlobToRegexp(t *testing.T) { {`***`, `(^|/)`, `too many stars`}, {`ab]c`, `(^|/)`, `mismatched ']'`}, {`ab[c`, `(^|/)`, `mismatched '[' and ']'`}, - {`ab{{cd`, `(^|/)`, `can't nest`}, + {`ab{x{cd`, `(^|/)`, `can't nest`}, {`ab{}}cd`, `(^|/)`, `mismatched '{' and '}'`}, {`ab}c`, `(^|/)`, `mismatched '{' and '}'`}, {`ab{c`, `(^|/)`, `mismatched '{' and '}'`}, @@ -40,16 +40,24 @@ func TestGlobToRegexp(t 
*testing.T) { {`[a--b]`, `(^|/)`, `bad glob pattern`}, {`a\*b`, `(^|/)a\*b$`, ``}, {`a\\b`, `(^|/)a\\b$`, ``}, + {`a{{.*}}b`, `(^|/)a(.*)b$`, ``}, + {`a{{.*}`, `(^|/)a(.*)b$`, `mismatched '{{' and '}}'`}, + {`{{regexp}}`, `(^|/)(regexp)$`, ``}, + {`\{{{regexp}}`, `(^|/)\{(regexp)$`, ``}, + {`/{{regexp}}`, `^(regexp)$`, ``}, + {`/{{\d{8}}}`, `^(\d{8})$`, ``}, + {`/{{\}}}`, `^(\})$`, ``}, + {`{{(?i)regexp}}`, `(^|/)((?i)regexp)$`, ``}, } { for _, ignoreCase := range []bool{false, true} { gotRe, err := GlobToRegexp(test.in, ignoreCase) if test.error == "" { + require.NoError(t, err, test.in) prefix := "" if ignoreCase { prefix = "(?i)" } got := gotRe.String() - require.NoError(t, err, test.in) assert.Equal(t, prefix+test.want, got, test.in) } else { require.Error(t, err, test.in) @@ -84,6 +92,7 @@ func TestGlobToDirGlobs(t *testing.T) { {`/a/{jpg,png,gif}/*.{jpg,png,gif}`, []string{"/a/{jpg,png,gif}/", "/a/", "/"}}, {`a/{a,a*b,a**c}/d/`, []string{"/**"}}, {`/a/{a,a*b,a/c,d}/d/`, []string{"/**"}}, + {`/a/{{.*}}/d/`, []string{"/**"}}, {`**`, []string{"**/"}}, {`a**`, []string{"a**/"}}, {`a**b`, []string{"a**/"}},