fs: stop normalizing file names but do a normalized compare in the sync
This works by using a transform function to transform file names when doing a compare when matching file names in a directory. rclone now UTF-8 normalizes the file names and does a case insensitive compare if the destination remote is case insensitive. This deprecates the --local-no-unicode-normalization flag. Fixes #1477
This commit is contained in:
parent
a8e41f081c
commit
4ac9a65049
5 changed files with 194 additions and 39 deletions
|
@ -122,15 +122,9 @@ $ rclone -L ls /tmp/a
|
|||
|
||||
#### --local-no-unicode-normalization ####
|
||||
|
||||
By default rclone normalizes (NFC) the unicode representation of filenames and
|
||||
directories. This flag disables that normalization and uses the same
|
||||
representation as the local filesystem.
|
||||
|
||||
This can be useful if you need to retain the local unicode representation and
|
||||
you are using a cloud provider which supports unnormalized names (e.g. S3 or ACD).
|
||||
|
||||
This should also work with any provider if you are using crypt and have file
|
||||
name encryption (the default) or obfuscation turned on.
|
||||
This flag is deprecated now. Rclone no longer normalizes unicode file
|
||||
names, but it compares them with unicode normalization in the sync
|
||||
routine instead.
|
||||
|
||||
#### --one-file-system, -x ####
|
||||
|
||||
|
|
115
fs/march.go
115
fs/march.go
|
@ -1,9 +1,13 @@
|
|||
package fs
|
||||
|
||||
import (
|
||||
"path"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"golang.org/x/net/context"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
// march traverses two Fs simultaneously, calling walker for each match
|
||||
|
@ -17,6 +21,7 @@ type march struct {
|
|||
// internal state
|
||||
srcListDir listDirFn // function to call to list a directory in the src
|
||||
dstListDir listDirFn // function to call to list a directory in the dst
|
||||
transforms []matchTransformFn
|
||||
}
|
||||
|
||||
// marcher is called on each match
|
||||
|
@ -40,6 +45,18 @@ func newMarch(ctx context.Context, fdst, fsrc Fs, dir string, callback marcher)
|
|||
}
|
||||
m.srcListDir = m.makeListDir(fsrc, false)
|
||||
m.dstListDir = m.makeListDir(fdst, Config.Filter.DeleteExcluded)
|
||||
// Now create the matching transform
|
||||
// ..normalise the UTF8 first
|
||||
m.transforms = append(m.transforms, norm.NFC.String)
|
||||
// ..if destination is caseInsensitive then make it lower case
|
||||
// case Insensitive | src | dst | lower case compare |
|
||||
// | No | No | No |
|
||||
// | Yes | No | No |
|
||||
// | No | Yes | Yes |
|
||||
// | Yes | Yes | Yes |
|
||||
if fdst.Features().CaseInsensitive {
|
||||
m.transforms = append(m.transforms, strings.ToLower)
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
|
@ -156,58 +173,122 @@ func (m *march) aborting() bool {
|
|||
return false
|
||||
}
|
||||
|
||||
// matchEntry is an entry plus transformed name
|
||||
type matchEntry struct {
|
||||
entry DirEntry
|
||||
leaf string
|
||||
name string
|
||||
}
|
||||
|
||||
// matchEntries contains many matchEntry~s
|
||||
type matchEntries []matchEntry
|
||||
|
||||
// Len is part of sort.Interface.
|
||||
func (es matchEntries) Len() int { return len(es) }
|
||||
|
||||
// Swap is part of sort.Interface.
|
||||
func (es matchEntries) Swap(i, j int) { es[i], es[j] = es[j], es[i] }
|
||||
|
||||
// Less is part of sort.Interface.
|
||||
//
|
||||
// Compare in order (name, leaf, remote)
|
||||
func (es matchEntries) Less(i, j int) bool {
|
||||
ei, ej := &es[i], &es[j]
|
||||
if ei.name == ej.name {
|
||||
if ei.leaf == ej.leaf {
|
||||
return ei.entry.Remote() < ej.entry.Remote()
|
||||
}
|
||||
return ei.leaf < ej.leaf
|
||||
}
|
||||
return ei.name < ej.name
|
||||
}
|
||||
|
||||
// Sort the directory entries by (name, leaf, remote)
|
||||
//
|
||||
// We use a stable sort here just in case there are
|
||||
// duplicates. Assuming the remote delivers the entries in a
|
||||
// consistent order, this will give the best user experience
|
||||
// in syncing as it will use the first entry for the sync
|
||||
// comparison.
|
||||
func (es matchEntries) sort() {
|
||||
sort.Stable(es)
|
||||
}
|
||||
|
||||
// make a matchEntries from a newMatch entries
|
||||
func newMatchEntries(entries DirEntries, transforms []matchTransformFn) matchEntries {
|
||||
es := make(matchEntries, len(entries))
|
||||
for i := range es {
|
||||
es[i].entry = entries[i]
|
||||
name := path.Base(entries[i].Remote())
|
||||
es[i].leaf = name
|
||||
for _, transform := range transforms {
|
||||
name = transform(name)
|
||||
}
|
||||
es[i].name = name
|
||||
}
|
||||
es.sort()
|
||||
return es
|
||||
}
|
||||
|
||||
// matchPair is a matched pair of direntries returned by matchListings
|
||||
type matchPair struct {
|
||||
src, dst DirEntry
|
||||
}
|
||||
|
||||
// Process the two sorted listings, matching up the items in the two
|
||||
// sorted slices
|
||||
// matchTransformFn converts a name into a form which is used for
|
||||
// comparison in matchListings.
|
||||
type matchTransformFn func(name string) string
|
||||
|
||||
// Process the two listings, matching up the items in the two slices
|
||||
// using the transform function on each name first.
|
||||
//
|
||||
// Into srcOnly go Entries which only exist in the srcList
|
||||
// Into dstOnly go Entries which only exist in the dstList
|
||||
// Into matches go matchPair's of src and dst which have the same name
|
||||
//
|
||||
// This checks for duplicates and checks the list is sorted.
|
||||
func matchListings(srcList, dstList DirEntries) (srcOnly DirEntries, dstOnly DirEntries, matches []matchPair) {
|
||||
func matchListings(srcListEntries, dstListEntries DirEntries, transforms []matchTransformFn) (srcOnly DirEntries, dstOnly DirEntries, matches []matchPair) {
|
||||
srcList := newMatchEntries(srcListEntries, transforms)
|
||||
dstList := newMatchEntries(dstListEntries, transforms)
|
||||
for iSrc, iDst := 0, 0; ; iSrc, iDst = iSrc+1, iDst+1 {
|
||||
var src, dst DirEntry
|
||||
var srcRemote, dstRemote string
|
||||
var srcName, dstName string
|
||||
if iSrc < len(srcList) {
|
||||
src = srcList[iSrc]
|
||||
srcRemote = src.Remote()
|
||||
src = srcList[iSrc].entry
|
||||
srcName = srcList[iSrc].name
|
||||
}
|
||||
if iDst < len(dstList) {
|
||||
dst = dstList[iDst]
|
||||
dstRemote = dst.Remote()
|
||||
dst = dstList[iDst].entry
|
||||
dstName = dstList[iDst].name
|
||||
}
|
||||
if src == nil && dst == nil {
|
||||
break
|
||||
}
|
||||
if src != nil && iSrc > 0 {
|
||||
prev := srcList[iSrc-1].Remote()
|
||||
if srcRemote == prev {
|
||||
prev := srcList[iSrc-1].name
|
||||
if srcName == prev {
|
||||
Logf(src, "Duplicate %s found in source - ignoring", DirEntryType(src))
|
||||
src = nil // ignore the src
|
||||
} else if srcRemote < prev {
|
||||
} else if srcName < prev {
|
||||
Errorf(src, "Out of order listing in source")
|
||||
src = nil // ignore the src
|
||||
}
|
||||
}
|
||||
if dst != nil && iDst > 0 {
|
||||
prev := dstList[iDst-1].Remote()
|
||||
if dstRemote == prev {
|
||||
prev := dstList[iDst-1].name
|
||||
if dstName == prev {
|
||||
Logf(dst, "Duplicate %s found in destination - ignoring", DirEntryType(dst))
|
||||
dst = nil // ignore the dst
|
||||
} else if dstRemote < prev {
|
||||
} else if dstName < prev {
|
||||
Errorf(dst, "Out of order listing in destination")
|
||||
dst = nil // ignore the dst
|
||||
}
|
||||
}
|
||||
if src != nil && dst != nil {
|
||||
if srcRemote < dstRemote {
|
||||
if srcName < dstName {
|
||||
dst = nil
|
||||
iDst-- // retry the dst
|
||||
} else if srcRemote > dstRemote {
|
||||
} else if srcName > dstName {
|
||||
src = nil
|
||||
iSrc-- // retry the src
|
||||
}
|
||||
|
@ -271,7 +352,7 @@ func (m *march) processJob(job listDirJob) (jobs []listDirJob) {
|
|||
}
|
||||
|
||||
// Work out what to do and do it
|
||||
srcOnly, dstOnly, matches := matchListings(srcList, dstList)
|
||||
srcOnly, dstOnly, matches := matchListings(srcList, dstList, m.transforms)
|
||||
for _, src := range srcOnly {
|
||||
if m.aborting() {
|
||||
return nil
|
||||
|
|
|
@ -3,25 +3,53 @@
|
|||
package fs
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestNewMatchEntries(t *testing.T) {
|
||||
var (
|
||||
a = mockObject("path/a")
|
||||
A = mockObject("path/A")
|
||||
B = mockObject("path/B")
|
||||
c = mockObject("path/c")
|
||||
)
|
||||
|
||||
es := newMatchEntries(DirEntries{a, A, B, c}, nil)
|
||||
assert.Equal(t, es, matchEntries{
|
||||
{name: "A", leaf: "A", entry: A},
|
||||
{name: "B", leaf: "B", entry: B},
|
||||
{name: "a", leaf: "a", entry: a},
|
||||
{name: "c", leaf: "c", entry: c},
|
||||
})
|
||||
|
||||
es = newMatchEntries(DirEntries{a, A, B, c}, []matchTransformFn{strings.ToLower})
|
||||
assert.Equal(t, es, matchEntries{
|
||||
{name: "a", leaf: "A", entry: A},
|
||||
{name: "a", leaf: "a", entry: a},
|
||||
{name: "b", leaf: "B", entry: B},
|
||||
{name: "c", leaf: "c", entry: c},
|
||||
})
|
||||
}
|
||||
|
||||
func TestMatchListings(t *testing.T) {
|
||||
var (
|
||||
a = mockObject("a")
|
||||
A = mockObject("A")
|
||||
b = mockObject("b")
|
||||
c = mockObject("c")
|
||||
d = mockObject("d")
|
||||
)
|
||||
|
||||
for _, test := range []struct {
|
||||
what string
|
||||
input DirEntries // pairs of input src, dst
|
||||
srcOnly DirEntries
|
||||
dstOnly DirEntries
|
||||
matches []matchPair // pairs of output
|
||||
what string
|
||||
input DirEntries // pairs of input src, dst
|
||||
srcOnly DirEntries
|
||||
dstOnly DirEntries
|
||||
matches []matchPair // pairs of output
|
||||
transforms []matchTransformFn
|
||||
}{
|
||||
{
|
||||
what: "only src or dst",
|
||||
|
@ -92,6 +120,28 @@ func TestMatchListings(t *testing.T) {
|
|||
},
|
||||
},
|
||||
{
|
||||
what: "Case insensitive duplicate - no transform",
|
||||
input: DirEntries{
|
||||
a, a,
|
||||
A, A,
|
||||
},
|
||||
matches: []matchPair{
|
||||
{A, A},
|
||||
{a, a},
|
||||
},
|
||||
},
|
||||
{
|
||||
what: "Case insensitive duplicate - transform to lower case",
|
||||
input: DirEntries{
|
||||
a, a,
|
||||
A, A,
|
||||
},
|
||||
matches: []matchPair{
|
||||
{A, A},
|
||||
},
|
||||
transforms: []matchTransformFn{strings.ToLower},
|
||||
},
|
||||
/*{
|
||||
what: "Out of order",
|
||||
input: DirEntries{
|
||||
c, nil,
|
||||
|
@ -104,7 +154,7 @@ func TestMatchListings(t *testing.T) {
|
|||
dstOnly: DirEntries{
|
||||
b,
|
||||
},
|
||||
},
|
||||
},*/
|
||||
} {
|
||||
var srcList, dstList DirEntries
|
||||
for i := 0; i < len(test.input); i += 2 {
|
||||
|
@ -116,12 +166,12 @@ func TestMatchListings(t *testing.T) {
|
|||
dstList = append(dstList, dst)
|
||||
}
|
||||
}
|
||||
srcOnly, dstOnly, matches := matchListings(srcList, dstList)
|
||||
srcOnly, dstOnly, matches := matchListings(srcList, dstList, test.transforms)
|
||||
assert.Equal(t, test.srcOnly, srcOnly, test.what)
|
||||
assert.Equal(t, test.dstOnly, dstOnly, test.what)
|
||||
assert.Equal(t, test.matches, matches, test.what)
|
||||
// now swap src and dst
|
||||
dstOnly, srcOnly, matches = matchListings(dstList, srcList)
|
||||
dstOnly, srcOnly, matches = matchListings(dstList, srcList, test.transforms)
|
||||
assert.Equal(t, test.srcOnly, srcOnly, test.what)
|
||||
assert.Equal(t, test.dstOnly, dstOnly, test.what)
|
||||
assert.Equal(t, test.matches, matches, test.what)
|
||||
|
|
|
@ -10,6 +10,7 @@ import (
|
|||
"github.com/ncw/rclone/fstest"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
// Check dry run is working
|
||||
|
@ -961,3 +962,32 @@ func testSyncBackupDir(t *testing.T, suffix string) {
|
|||
}
|
||||
func TestSyncBackupDir(t *testing.T) { testSyncBackupDir(t, "") }
|
||||
func TestSyncBackupDirWithSuffix(t *testing.T) { testSyncBackupDir(t, ".bak") }
|
||||
|
||||
// Check we can sync two files with differing UTF-8 representations
|
||||
func TestSyncUTFNorm(t *testing.T) {
|
||||
r := NewRun(t)
|
||||
defer r.Finalise()
|
||||
|
||||
// Two strings with different unicode normalization (from OS X)
|
||||
Encoding1 := "Testêé"
|
||||
Encoding2 := "Testêé"
|
||||
assert.NotEqual(t, Encoding1, Encoding2)
|
||||
assert.Equal(t, norm.NFC.String(Encoding1), norm.NFC.String(Encoding2))
|
||||
|
||||
file1 := r.WriteFile(Encoding1, "This is a test", t1)
|
||||
fstest.CheckItems(t, r.flocal, file1)
|
||||
|
||||
file2 := r.WriteObject(Encoding2, "This is a old test", t2)
|
||||
fstest.CheckItems(t, r.fremote, file2)
|
||||
|
||||
fs.Stats.ResetCounters()
|
||||
err := fs.Sync(r.fremote, r.flocal)
|
||||
require.NoError(t, err)
|
||||
|
||||
// We should have transferred exactly one file, but kept the
|
||||
// normalized state of the file.
|
||||
assert.Equal(t, int64(1), fs.Stats.GetTransfers())
|
||||
fstest.CheckItems(t, r.flocal, file1)
|
||||
file1.Path = file2.Path
|
||||
fstest.CheckItems(t, r.fremote, file1)
|
||||
}
|
||||
|
|
|
@ -15,10 +15,9 @@ import (
|
|||
"time"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/unicode/norm"
|
||||
|
||||
"github.com/ncw/rclone/fs"
|
||||
"github.com/pkg/errors"
|
||||
"google.golang.org/appengine/log"
|
||||
)
|
||||
|
||||
var (
|
||||
|
@ -82,6 +81,10 @@ type Object struct {
|
|||
func NewFs(name, root string) (fs.Fs, error) {
|
||||
var err error
|
||||
|
||||
if *noUTFNorm {
|
||||
log.Errorf(nil, "The --local-no-unicode-normalization flag is deprecated and will be removed")
|
||||
}
|
||||
|
||||
nounc := fs.ConfigFileGet(name, "nounc")
|
||||
f := &Fs{
|
||||
name: name,
|
||||
|
@ -273,9 +276,6 @@ func (f *Fs) cleanRemote(name string) string {
|
|||
f.wmu.Unlock()
|
||||
name = string([]rune(name))
|
||||
}
|
||||
if !*noUTFNorm {
|
||||
name = norm.NFC.String(name)
|
||||
}
|
||||
name = filepath.ToSlash(name)
|
||||
return name
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue