filter: add {{ regexp }}
syntax to pattern matches - fixes #4074
There has been a desire from more advanced rclone users to have regexp filtering as well as the glob filtering. This patch adds regexp filtering using this syntax `{{ regexp }}` which is currently a syntax error, so is backwards compatible. This means regexps can be used everywhere globs can be used, and that they also can be mixed with globs in the same pattern, e.g. `*.{{jpe?g}}`
This commit is contained in:
parent
74898bac3b
commit
268b808bf8
4 changed files with 166 additions and 13 deletions
|
@ -33,6 +33,9 @@ you expect. Instead use a `--filter...` flag.
|
|||
|
||||
### Pattern syntax
|
||||
|
||||
Here is a formal definition of the pattern syntax,
|
||||
[examples](#examples) are below.
|
||||
|
||||
Rclone matching rules follow a glob style:
|
||||
|
||||
* matches any sequence of non-separator (/) characters
|
||||
|
@ -42,8 +45,10 @@ Rclone matching rules follow a glob style:
|
|||
character class (must be non-empty)
|
||||
{ pattern-list }
|
||||
pattern alternatives
|
||||
{{ regexp }}
|
||||
regular expression to match
|
||||
c matches character c (c != *, **, ?, \, [, {, })
|
||||
\c matches reserved character c (c = *, **, ?, \, [, {, })
|
||||
\c matches reserved character c (c = *, **, ?, \, [, {, }) or character class
|
||||
|
||||
character-range:
|
||||
|
||||
|
@ -62,6 +67,10 @@ character classes (see [Go regular expression reference](https://golang.org/pkg/
|
|||
Perl character classes (e.g. \s, \S, \w, \W)
|
||||
ASCII character classes (e.g. [[:alnum:]], [[:alpha:]], [[:punct:]], [[:xdigit:]])
|
||||
|
||||
regexp for advanced users to insert a regular expression - see [below](#regexp) for more info:
|
||||
|
||||
Any re2 regular expression not containing `}}`
|
||||
|
||||
If the filter pattern starts with a `/` then it only matches
|
||||
at the top level of the directory tree,
|
||||
**relative to the root of the remote** (not necessarily the root
|
||||
|
@ -111,6 +120,75 @@ With `--ignore-case`
|
|||
potato - matches "potato"
|
||||
- matches "POTATO"
|
||||
|
||||
## Using regular expressions in filter patterns {#regexp}
|
||||
|
||||
The syntax of filter patterns is glob style matching (like `bash`
|
||||
uses) to make things easy for users. However this does not provide
|
||||
absolute control over the matching, so for advanced users rclone also
|
||||
provides a regular expression syntax.
|
||||
|
||||
The regular expressions used are as defined in the [Go regular
|
||||
expression reference](https://golang.org/pkg/regexp/syntax/). Regular
|
||||
expressions should be enclosed in `{{` `}}`. They will match only the
|
||||
last path segment if the glob doesn't start with `/` or the whole path
|
||||
name if it does.
|
||||
|
||||
Here is how the `{{regexp}}` is transformed into a full regular
|
||||
expression to match the entire path:
|
||||
|
||||
{{regexp}} becomes (^|/)(regexp)$
|
||||
/{{regexp}} becomes ^(regexp)$
|
||||
|
||||
Regexp syntax can be mixed with glob syntax, for example
|
||||
|
||||
*.{{jpe?g}} to match file.jpg, file.jpeg but not file.png
|
||||
|
||||
You can also use regexp flags - for example, to make the match case insensitive
|
||||
|
||||
*.{{(?i)jpg}} to match file.jpg, file.JPG but not file.png
|
||||
|
||||
Be careful with wildcards in regular expressions - you don't want them
|
||||
to match path separators normally. To match any file name starting
|
||||
with `start` and ending with `end` write
|
||||
|
||||
{{start[^/]*end\.jpg}}
|
||||
|
||||
Not
|
||||
|
||||
{{start.*end\.jpg}}
|
||||
|
||||
Which will match a directory called `start` with a file called
|
||||
`end.jpg` in it as the `.*` will match `/` characters.
|
||||
|
||||
Note that you can use `-vv --dump filters` to show the filter patterns
|
||||
in regexp format - rclone implements the glob patterns by transforming
|
||||
them into regular expressions.
|
||||
|
||||
## Filter pattern examples {#examples}
|
||||
|
||||
| Description | Pattern | Matches | Does not match |
|
||||
| ----------- |-------- | ------- | -------------- |
|
||||
| Wildcard | `*.jpg` | `/file.jpg` | `/file.png` |
|
||||
| | | `/dir/file.jpg` | `/dir/file.png` |
|
||||
| Rooted | `/*.jpg` | `/file.jpg` | `/file.png` |
|
||||
| | | `/file2.jpg` | `/dir/file.jpg` |
|
||||
| Alternates | `*.{jpg,png}` | `/file.jpg` | `/file.gif` |
|
||||
| | | `/dir/file.png` | `/dir/file.gif` |
|
||||
| Path Wildcard | `dir/**` | `/dir/anyfile` | `file.png` |
|
||||
| | | `/subdir/dir/subsubdir/anyfile` | `/subdir/file.png` |
|
||||
| Any Char | `*.t?t` | `/file.txt` | `/file.qxt` |
|
||||
| | | `/dir/file.tzt` | `/dir/file.png` |
|
||||
| Range | `*.[a-z]` | `/file.a` | `/file.0` |
|
||||
| | | `/dir/file.b` | `/dir/file.1` |
|
||||
| Escape | `*.\?\?\?` | `/file.???` | `/file.abc` |
|
||||
| | | `/dir/file.???` | `/dir/file.def` |
|
||||
| Class | `*.\d\d\d` | `/file.012` | `/file.abc` |
|
||||
| | | `/dir/file.345` | `/dir/file.def` |
|
||||
| Regexp | `*.{{jpe?g}}` | `/file.jpeg` | `/file.png` |
|
||||
| | | `/dir/file.jpg` | `/dir/file.jpeeg` |
|
||||
| Rooted Regexp | `/{{.*\.jpe?g}}` | `/file.jpeg` | `/file.png` |
|
||||
| | | `/file.jpg` | `/dir/file.jpg` |
|
||||
|
||||
## How filter rules are applied to files
|
||||
|
||||
Rclone path/file name filters are made up of one or more of the following flags:
|
||||
|
|
|
@ -503,6 +503,31 @@ func TestNewFilterMatchesIgnoreCase(t *testing.T) {
|
|||
assert.False(t, f.InActive())
|
||||
}
|
||||
|
||||
func TestNewFilterMatchesRegexp(t *testing.T) {
|
||||
f, err := NewFilter(nil)
|
||||
require.NoError(t, err)
|
||||
add := func(s string) {
|
||||
err := f.AddRule(s)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
add(`+ /{{file\d+\.png}}`)
|
||||
add(`+ *.{{(?i)jpg}}`)
|
||||
add(`- *`)
|
||||
testInclude(t, f, []includeTest{
|
||||
{"file2.png", 100, 0, true},
|
||||
{"sub/file2.png", 100, 0, false},
|
||||
{"file123.png", 100, 0, true},
|
||||
{"File123.png", 100, 0, false},
|
||||
{"something.jpg", 100, 0, true},
|
||||
{"deep/path/something.JPG", 100, 0, true},
|
||||
{"something.gif", 100, 0, false},
|
||||
})
|
||||
testDirInclude(t, f, []includeDirTest{
|
||||
{"anything at all", true},
|
||||
})
|
||||
assert.False(t, f.InActive())
|
||||
}
|
||||
|
||||
func TestFilterAddDirRuleOrFileRule(t *testing.T) {
|
||||
for _, test := range []struct {
|
||||
included bool
|
||||
|
|
|
@ -19,7 +19,7 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) {
|
|||
}
|
||||
if strings.HasPrefix(glob, "/") {
|
||||
glob = glob[1:]
|
||||
_, _ = re.WriteRune('^')
|
||||
_ = re.WriteByte('^')
|
||||
} else {
|
||||
_, _ = re.WriteString("(^|/)")
|
||||
}
|
||||
|
@ -38,15 +38,45 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) {
|
|||
consecutiveStars = 0
|
||||
return nil
|
||||
}
|
||||
overwriteLastChar := func(c byte) {
|
||||
buf := re.Bytes()
|
||||
buf[len(buf)-1] = c
|
||||
}
|
||||
inBraces := false
|
||||
inBrackets := 0
|
||||
slashed := false
|
||||
inRegexp := false // inside {{ ... }}
|
||||
inRegexpEnd := false // have received }} waiting for more
|
||||
var next, last rune
|
||||
for _, c := range glob {
|
||||
next, last = c, next
|
||||
if slashed {
|
||||
_, _ = re.WriteRune(c)
|
||||
slashed = false
|
||||
continue
|
||||
}
|
||||
if inRegexpEnd {
|
||||
if c == '}' {
|
||||
// Regexp is ending with }} choose longest segment
|
||||
// Replace final ) with }
|
||||
overwriteLastChar('}')
|
||||
_ = re.WriteByte(')')
|
||||
continue
|
||||
} else {
|
||||
inRegexpEnd = false
|
||||
}
|
||||
}
|
||||
if inRegexp {
|
||||
if c == '}' && last == '}' {
|
||||
inRegexp = false
|
||||
inRegexpEnd = true
|
||||
// Replace final } with )
|
||||
overwriteLastChar(')')
|
||||
} else {
|
||||
_, _ = re.WriteRune(c)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if c != '*' {
|
||||
err := insertStars()
|
||||
if err != nil {
|
||||
|
@ -78,24 +108,30 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) {
|
|||
return nil, fmt.Errorf("mismatched ']' in glob %q", glob)
|
||||
case '{':
|
||||
if inBraces {
|
||||
if last == '{' {
|
||||
inRegexp = true
|
||||
inBraces = false
|
||||
} else {
|
||||
return nil, fmt.Errorf("can't nest '{' '}' in glob %q", glob)
|
||||
}
|
||||
} else {
|
||||
inBraces = true
|
||||
_, _ = re.WriteRune('(')
|
||||
_ = re.WriteByte('(')
|
||||
}
|
||||
case '}':
|
||||
if !inBraces {
|
||||
return nil, fmt.Errorf("mismatched '{' and '}' in glob %q", glob)
|
||||
}
|
||||
_, _ = re.WriteRune(')')
|
||||
_ = re.WriteByte(')')
|
||||
inBraces = false
|
||||
case ',':
|
||||
if inBraces {
|
||||
_, _ = re.WriteRune('|')
|
||||
_ = re.WriteByte('|')
|
||||
} else {
|
||||
_, _ = re.WriteRune(c)
|
||||
}
|
||||
case '.', '+', '(', ')', '|', '^', '$': // regexp meta characters not dealt with above
|
||||
_, _ = re.WriteRune('\\')
|
||||
_ = re.WriteByte('\\')
|
||||
_, _ = re.WriteRune(c)
|
||||
default:
|
||||
_, _ = re.WriteRune(c)
|
||||
|
@ -111,7 +147,10 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) {
|
|||
if inBraces {
|
||||
return nil, fmt.Errorf("mismatched '{' and '}' in glob %q", glob)
|
||||
}
|
||||
_, _ = re.WriteRune('$')
|
||||
if inRegexp {
|
||||
return nil, fmt.Errorf("mismatched '{{' and '}}' in glob %q", glob)
|
||||
}
|
||||
_ = re.WriteByte('$')
|
||||
result, err := regexp.Compile(re.String())
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("bad glob pattern %q (regexp %q): %w", glob, re.String(), err)
|
||||
|
@ -120,8 +159,10 @@ func GlobToRegexp(glob string, ignoreCase bool) (*regexp.Regexp, error) {
|
|||
}
|
||||
|
||||
var (
|
||||
// Can't deal with / or ** in {}
|
||||
tooHardRe = regexp.MustCompile(`{[^{}]*(\*\*|/)[^{}]*}`)
|
||||
// Can't deal with
|
||||
// / or ** in {}
|
||||
// {{ regexp }}
|
||||
tooHardRe = regexp.MustCompile(`({[^{}]*(\*\*|/)[^{}]*})|\{\{|\}\}`)
|
||||
|
||||
// Squash all /
|
||||
squashSlash = regexp.MustCompile(`/{2,}`)
|
||||
|
|
|
@ -32,7 +32,7 @@ func TestGlobToRegexp(t *testing.T) {
|
|||
{`***`, `(^|/)`, `too many stars`},
|
||||
{`ab]c`, `(^|/)`, `mismatched ']'`},
|
||||
{`ab[c`, `(^|/)`, `mismatched '[' and ']'`},
|
||||
{`ab{{cd`, `(^|/)`, `can't nest`},
|
||||
{`ab{x{cd`, `(^|/)`, `can't nest`},
|
||||
{`ab{}}cd`, `(^|/)`, `mismatched '{' and '}'`},
|
||||
{`ab}c`, `(^|/)`, `mismatched '{' and '}'`},
|
||||
{`ab{c`, `(^|/)`, `mismatched '{' and '}'`},
|
||||
|
@ -40,16 +40,24 @@ func TestGlobToRegexp(t *testing.T) {
|
|||
{`[a--b]`, `(^|/)`, `bad glob pattern`},
|
||||
{`a\*b`, `(^|/)a\*b$`, ``},
|
||||
{`a\\b`, `(^|/)a\\b$`, ``},
|
||||
{`a{{.*}}b`, `(^|/)a(.*)b$`, ``},
|
||||
{`a{{.*}`, `(^|/)a(.*)b$`, `mismatched '{{' and '}}'`},
|
||||
{`{{regexp}}`, `(^|/)(regexp)$`, ``},
|
||||
{`\{{{regexp}}`, `(^|/)\{(regexp)$`, ``},
|
||||
{`/{{regexp}}`, `^(regexp)$`, ``},
|
||||
{`/{{\d{8}}}`, `^(\d{8})$`, ``},
|
||||
{`/{{\}}}`, `^(\})$`, ``},
|
||||
{`{{(?i)regexp}}`, `(^|/)((?i)regexp)$`, ``},
|
||||
} {
|
||||
for _, ignoreCase := range []bool{false, true} {
|
||||
gotRe, err := GlobToRegexp(test.in, ignoreCase)
|
||||
if test.error == "" {
|
||||
require.NoError(t, err, test.in)
|
||||
prefix := ""
|
||||
if ignoreCase {
|
||||
prefix = "(?i)"
|
||||
}
|
||||
got := gotRe.String()
|
||||
require.NoError(t, err, test.in)
|
||||
assert.Equal(t, prefix+test.want, got, test.in)
|
||||
} else {
|
||||
require.Error(t, err, test.in)
|
||||
|
@ -84,6 +92,7 @@ func TestGlobToDirGlobs(t *testing.T) {
|
|||
{`/a/{jpg,png,gif}/*.{jpg,png,gif}`, []string{"/a/{jpg,png,gif}/", "/a/", "/"}},
|
||||
{`a/{a,a*b,a**c}/d/`, []string{"/**"}},
|
||||
{`/a/{a,a*b,a/c,d}/d/`, []string{"/**"}},
|
||||
{`/a/{{.*}}/d/`, []string{"/**"}},
|
||||
{`**`, []string{"**/"}},
|
||||
{`a**`, []string{"a**/"}},
|
||||
{`a**b`, []string{"a**/"}},
|
||||
|
|
Loading…
Reference in a new issue