filter: add {{ regexp }} syntax to pattern matches - fixes #4074

There has been a desire from more advanced rclone users to have regexp
filtering as well as the glob filtering.

This patch adds regexp filtering using this syntax `{{ regexp }}`
which is currently a syntax error, so is backwards compatible.

This means regexps can be used everywhere globs can be used, and that
they also can be mixed with globs in the same pattern, eg `*.{{jpe?g}}`
This commit is contained in:
Nick Craig-Wood 2021-10-09 12:56:23 +01:00
parent 74898bac3b
commit 268b808bf8
4 changed files with 166 additions and 13 deletions

View file

@ -33,6 +33,9 @@ you expect. Instead use a `--filter...` flag.
### Pattern syntax
Here is a formal definition of the pattern syntax,
[examples](#examples) are below.
Rclone matching rules follow a glob style:
* matches any sequence of non-separator (/) characters
@ -42,8 +45,10 @@ Rclone matching rules follow a glob style:
character class (must be non-empty)
{ pattern-list }
pattern alternatives
{{ regexp }}
regular expression to match
c matches character c (c != *, **, ?, \, [, {, })
\c matches reserved character c (c = *, **, ?, \, [, {, })
\c matches reserved character c (c = *, **, ?, \, [, {, }) or character class
character-range:
@ -62,6 +67,10 @@ character classes (see [Go regular expression reference](https://golang.org/pkg/
Perl character classes (e.g. \s, \S, \w, \W)
ASCII character classes (e.g. [[:alnum:]], [[:alpha:]], [[:punct:]], [[:xdigit:]])
regexp for advanced users to insert a regular expression - see [below](#regexp) for more info:
Any re2 regular expression not containing `}}`
If the filter pattern starts with a `/` then it only matches
at the top level of the directory tree,
**relative to the root of the remote** (not necessarily the root
@ -111,6 +120,75 @@ With `--ignore-case`
potato - matches "potato"
- matches "POTATO"
## Using regular expressions in filter patterns {#regexp}
The syntax of filter patterns is glob style matching (like `bash`
uses) to make things easy for users. However this does not provide
absolute control over the matching, so for advanced users rclone also
provides a regular expression syntax.
The regular expressions used are as defined in the [Go regular
expression reference](https://golang.org/pkg/regexp/syntax/). Regular
expressions should be enclosed in `{{` `}}`. They will match only the
last path segment if the glob doesn't start with `/` or the whole path
name if it does.
Here is how the `{{regexp}}` is transformed into a full regular
expression to match the entire path:
{{regexp}} becomes (^|/)(regexp)$
/{{regexp}} becomes ^(regexp)$
Regexp syntax can be mixed with glob syntax, for example
*.{{jpe?g}} to match file.jpg, file.jpeg but not file.png
You can also use regexp flags - to set case insensitive, for example
*.{{(?i)jpg}} to match file.jpg, file.JPG but not file.png
Be careful with wildcards in regular expressions - you don't want them
to match path separators normally. To match any file name starting
with `start` and ending with `end` write
{{start[^/]*end\.jpg}}
Not
{{start.*end\.jpg}}
Which will match a directory called `start` with a file called
`end.jpg` in it as the `.*` will match `/` characters.
Note that you can use `-vv --dump filters` to show the filter patterns
in regexp format - rclone implements the glob patterns by transforming
them into regular expressions.
## Filter pattern examples {#examples}
| Description | Pattern | Matches | Does not match |
| ----------- |-------- | ------- | -------------- |
| Wildcard | `*.jpg` | `/file.jpg` | `/file.png` |
| | | `/dir/file.jpg` | `/dir/file.png` |
| Rooted | `/*.jpg` | `/file.jpg` | `/file.png` |
| | | `/file2.jpg` | `/dir/file.jpg` |
| Alternates | `*.{jpg,png}` | `/file.jpg` | `/file.gif` |
| | | `/dir/file.png` | `/dir/file.gif` |
| Path Wildcard | `dir/**` | `/dir/anyfile` | `file.png` |
| | | `/subdir/dir/subsubdir/anyfile` | `/subdir/file.png` |
| Any Char | `*.t?t` | `/file.txt` | `/file.qxt` |
| | | `/dir/file.tzt` | `/dir/file.png` |
| Range | `*.[a-z]` | `/file.a` | `/file.0` |
| | | `/dir/file.b` | `/dir/file.1` |
| Escape | `*.\?\?\?` | `/file.???` | `/file.abc` |
| | | `/dir/file.???` | `/dir/file.def` |
| Class | `*.\d\d\d` | `/file.012` | `/file.abc` |
| | | `/dir/file.345` | `/dir/file.def` |
| Regexp | `*.{{jpe?g}}` | `/file.jpeg` | `/file.png` |
| | | `/dir/file.jpg` | `/dir/file.jpeeg` |
| Rooted Regexp | `/{{.*\.jpe?g}}` | `/file.jpeg` | `/file.png` |
| | | `/file.jpg` | `/dir/file.jpg` |
## How filter rules are applied to files
Rclone path/file name filters are made up of one or more of the following flags:

View file

@ -503,6 +503,31 @@ func TestNewFilterMatchesIgnoreCase(t *testing.T) {
assert.False(t, f.InActive())
}
// TestNewFilterMatchesRegexp checks that filter rules containing
// {{ regexp }} sections are accepted by AddRule and are honoured when
// deciding which files and directories to include.
func TestNewFilterMatchesRegexp(t *testing.T) {
	f, err := NewFilter(nil)
	require.NoError(t, err)

	// Rules are added in this order; every one of them must be accepted.
	rules := []string{
		`+ /{{file\d+\.png}}`,
		`+ *.{{(?i)jpg}}`,
		`- *`,
	}
	for _, rule := range rules {
		require.NoError(t, f.AddRule(rule))
	}

	fileCases := []includeTest{
		// Rooted regexp rule: included at the top level only, and
		// only with a lower case "file" prefix.
		{"file2.png", 100, 0, true},
		{"sub/file2.png", 100, 0, false},
		{"file123.png", 100, 0, true},
		{"File123.png", 100, 0, false},
		// Glob mixed with a (?i) regexp: included at any depth and
		// with either case of the extension.
		{"something.jpg", 100, 0, true},
		{"deep/path/something.JPG", 100, 0, true},
		// Anything else is excluded.
		{"something.gif", 100, 0, false},
	}
	testInclude(t, f, fileCases)

	dirCases := []includeDirTest{
		{"anything at all", true},
	}
	testDirInclude(t, f, dirCases)

	assert.False(t, f.InActive())
}
func TestFilterAddDirRuleOrFileRule(t *testing.T) {
for _, test := range []struct {
included bool

View file

@ -19,7 +19,7 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) {
}
if strings.HasPrefix(glob, "/") {
glob = glob[1:]
_, _ = re.WriteRune('^')
_ = re.WriteByte('^')
} else {
_, _ = re.WriteString("(^|/)")
}
@ -38,15 +38,45 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) {
consecutiveStars = 0
return nil
}
overwriteLastChar := func(c byte) {
buf := re.Bytes()
buf[len(buf)-1] = c
}
inBraces := false
inBrackets := 0
slashed := false
inRegexp := false // inside {{ ... }}
inRegexpEnd := false // have received }} waiting for more
var next, last rune
for _, c := range glob {
next, last = c, next
if slashed {
_, _ = re.WriteRune(c)
slashed = false
continue
}
if inRegexpEnd {
if c == '}' {
// Regexp is ending with }} choose longest segment
// Replace final ) with }
overwriteLastChar('}')
_ = re.WriteByte(')')
continue
} else {
inRegexpEnd = false
}
}
if inRegexp {
if c == '}' && last == '}' {
inRegexp = false
inRegexpEnd = true
// Replace final } with )
overwriteLastChar(')')
} else {
_, _ = re.WriteRune(c)
}
continue
}
if c != '*' {
err := insertStars()
if err != nil {
@ -78,24 +108,30 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) {
return nil, fmt.Errorf("mismatched ']' in glob %q", glob)
case '{':
if inBraces {
return nil, fmt.Errorf("can't nest '{' '}' in glob %q", glob)
if last == '{' {
inRegexp = true
inBraces = false
} else {
return nil, fmt.Errorf("can't nest '{' '}' in glob %q", glob)
}
} else {
inBraces = true
_ = re.WriteByte('(')
}
inBraces = true
_, _ = re.WriteRune('(')
case '}':
if !inBraces {
return nil, fmt.Errorf("mismatched '{' and '}' in glob %q", glob)
}
_, _ = re.WriteRune(')')
_ = re.WriteByte(')')
inBraces = false
case ',':
if inBraces {
_, _ = re.WriteRune('|')
_ = re.WriteByte('|')
} else {
_, _ = re.WriteRune(c)
}
case '.', '+', '(', ')', '|', '^', '$': // regexp meta characters not dealt with above
_, _ = re.WriteRune('\\')
_ = re.WriteByte('\\')
_, _ = re.WriteRune(c)
default:
_, _ = re.WriteRune(c)
@ -111,7 +147,10 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) {
if inBraces {
return nil, fmt.Errorf("mismatched '{' and '}' in glob %q", glob)
}
_, _ = re.WriteRune('$')
if inRegexp {
return nil, fmt.Errorf("mismatched '{{' and '}}' in glob %q", glob)
}
_ = re.WriteByte('$')
result, err := regexp.Compile(re.String())
if err != nil {
return nil, fmt.Errorf("bad glob pattern %q (regexp %q): %w", glob, re.String(), err)
@ -120,8 +159,10 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) {
}
var (
// Can't deal with / or ** in {}
tooHardRe = regexp.MustCompile(`{[^{}]*(\*\*|/)[^{}]*}`)
// Can't deal with
// / or ** in {}
// {{ regexp }}
tooHardRe = regexp.MustCompile(`({[^{}]*(\*\*|/)[^{}]*})|\{\{|\}\}`)
// Squash all /
squashSlash = regexp.MustCompile(`/{2,}`)

View file

@ -32,7 +32,7 @@ func TestGlobToRegexp(t *testing.T) {
{`***`, `(^|/)`, `too many stars`},
{`ab]c`, `(^|/)`, `mismatched ']'`},
{`ab[c`, `(^|/)`, `mismatched '[' and ']'`},
{`ab{{cd`, `(^|/)`, `can't nest`},
{`ab{x{cd`, `(^|/)`, `can't nest`},
{`ab{}}cd`, `(^|/)`, `mismatched '{' and '}'`},
{`ab}c`, `(^|/)`, `mismatched '{' and '}'`},
{`ab{c`, `(^|/)`, `mismatched '{' and '}'`},
@ -40,16 +40,24 @@ func TestGlobToRegexp(t *testing.T) {
{`[a--b]`, `(^|/)`, `bad glob pattern`},
{`a\*b`, `(^|/)a\*b$`, ``},
{`a\\b`, `(^|/)a\\b$`, ``},
{`a{{.*}}b`, `(^|/)a(.*)b$`, ``},
{`a{{.*}`, `(^|/)a(.*)b$`, `mismatched '{{' and '}}'`},
{`{{regexp}}`, `(^|/)(regexp)$`, ``},
{`\{{{regexp}}`, `(^|/)\{(regexp)$`, ``},
{`/{{regexp}}`, `^(regexp)$`, ``},
{`/{{\d{8}}}`, `^(\d{8})$`, ``},
{`/{{\}}}`, `^(\})$`, ``},
{`{{(?i)regexp}}`, `(^|/)((?i)regexp)$`, ``},
} {
for _, ignoreCase := range []bool{false, true} {
gotRe, err := GlobToRegexp(test.in, ignoreCase)
if test.error == "" {
require.NoError(t, err, test.in)
prefix := ""
if ignoreCase {
prefix = "(?i)"
}
got := gotRe.String()
require.NoError(t, err, test.in)
assert.Equal(t, prefix+test.want, got, test.in)
} else {
require.Error(t, err, test.in)
@ -84,6 +92,7 @@ func TestGlobToDirGlobs(t *testing.T) {
{`/a/{jpg,png,gif}/*.{jpg,png,gif}`, []string{"/a/{jpg,png,gif}/", "/a/", "/"}},
{`a/{a,a*b,a**c}/d/`, []string{"/**"}},
{`/a/{a,a*b,a/c,d}/d/`, []string{"/**"}},
{`/a/{{.*}}/d/`, []string{"/**"}},
{`**`, []string{"**/"}},
{`a**`, []string{"a**/"}},
{`a**b`, []string{"a**/"}},