fs: stop normalizing file names but do a normalized compare in the sync

This works by using a transform function to transform file names when
doing a compare when matching file names in a directory.  rclone now
UTF-8 normalizes the file names and does a case insensitive compare if
the destination remote is case insensitive.

This deprecates the --local-no-unicode-normalization flag.

Fixes #1477
This commit is contained in:
Nick Craig-Wood 2017-09-08 16:19:41 +01:00
parent a8e41f081c
commit 4ac9a65049
5 changed files with 194 additions and 39 deletions

View file

@ -122,15 +122,9 @@ $ rclone -L ls /tmp/a
#### --local-no-unicode-normalization ####
By default rclone normalizes (NFC) the unicode representation of filenames and
directories. This flag disables that normalization and uses the same
representation as the local filesystem.
This can be useful if you need to retain the local unicode representation and
you are using a cloud provider which supports unnormalized names (e.g. S3 or ACD).
This should also work with any provider if you are using crypt and have file
name encryption (the default) or obfuscation turned on.
This flag is deprecated now. Rclone no longer normalizes unicode file
names, but it compares them with unicode normalization in the sync
routine instead.
#### --one-file-system, -x ####

View file

@ -1,9 +1,13 @@
package fs
import (
"path"
"sort"
"strings"
"sync"
"golang.org/x/net/context"
"golang.org/x/text/unicode/norm"
)
// march traverses two Fs simultaneously, calling walker for each match
@ -17,6 +21,7 @@ type march struct {
// internal state
srcListDir listDirFn // function to call to list a directory in the src
dstListDir listDirFn // function to call to list a directory in the dst
transforms []matchTransformFn
}
// marcher is called on each match
@ -40,6 +45,18 @@ func newMarch(ctx context.Context, fdst, fsrc Fs, dir string, callback marcher)
}
m.srcListDir = m.makeListDir(fsrc, false)
m.dstListDir = m.makeListDir(fdst, Config.Filter.DeleteExcluded)
// Now create the matching transform
// ..normalise the UTF8 first
m.transforms = append(m.transforms, norm.NFC.String)
// ..if destination is caseInsensitive then make it lower case
// case Insensitive | src | dst | lower case compare |
// | No | No | No |
// | Yes | No | No |
// | No | Yes | Yes |
// | Yes | Yes | Yes |
if fdst.Features().CaseInsensitive {
m.transforms = append(m.transforms, strings.ToLower)
}
return m
}
@ -156,58 +173,122 @@ func (m *march) aborting() bool {
return false
}
// matchEntry is an entry plus transformed name
type matchEntry struct {
entry DirEntry
leaf string
name string
}
// matchEntries contains many matchEntry~s
type matchEntries []matchEntry
// Len is part of sort.Interface.
func (es matchEntries) Len() int { return len(es) }
// Swap is part of sort.Interface.
func (es matchEntries) Swap(i, j int) { es[i], es[j] = es[j], es[i] }
// Less is part of sort.Interface.
//
// Compare in order (name, leaf, remote)
func (es matchEntries) Less(i, j int) bool {
ei, ej := &es[i], &es[j]
if ei.name == ej.name {
if ei.leaf == ej.leaf {
return ei.entry.Remote() < ej.entry.Remote()
}
return ei.leaf < ej.leaf
}
return ei.name < ej.name
}
// Sort the directory entries by (name, leaf, remote)
//
// We use a stable sort here just in case there are
// duplicates. Assuming the remote delivers the entries in a
// consistent order, this will give the best user experience
// in syncing as it will use the first entry for the sync
// comparison.
func (es matchEntries) sort() {
sort.Stable(es)
}
// make a matchEntries from a newMatch entries
func newMatchEntries(entries DirEntries, transforms []matchTransformFn) matchEntries {
es := make(matchEntries, len(entries))
for i := range es {
es[i].entry = entries[i]
name := path.Base(entries[i].Remote())
es[i].leaf = name
for _, transform := range transforms {
name = transform(name)
}
es[i].name = name
}
es.sort()
return es
}
// matchPair is a matched pair of direntries returned by matchListings
type matchPair struct {
src, dst DirEntry
}
// Process the two sorted listings, matching up the items in the two
// sorted slices
// matchTransformFn converts a name into a form which is used for
// comparison in matchListings.
type matchTransformFn func(name string) string
// Process the two listings, matching up the items in the two slices
// using the transform function on each name first.
//
// Into srcOnly go Entries which only exist in the srcList
// Into dstOnly go Entries which only exist in the dstList
// Into matches go matchPair's of src and dst which have the same name
//
// This checks for duplicates and checks the list is sorted.
func matchListings(srcList, dstList DirEntries) (srcOnly DirEntries, dstOnly DirEntries, matches []matchPair) {
func matchListings(srcListEntries, dstListEntries DirEntries, transforms []matchTransformFn) (srcOnly DirEntries, dstOnly DirEntries, matches []matchPair) {
srcList := newMatchEntries(srcListEntries, transforms)
dstList := newMatchEntries(dstListEntries, transforms)
for iSrc, iDst := 0, 0; ; iSrc, iDst = iSrc+1, iDst+1 {
var src, dst DirEntry
var srcRemote, dstRemote string
var srcName, dstName string
if iSrc < len(srcList) {
src = srcList[iSrc]
srcRemote = src.Remote()
src = srcList[iSrc].entry
srcName = srcList[iSrc].name
}
if iDst < len(dstList) {
dst = dstList[iDst]
dstRemote = dst.Remote()
dst = dstList[iDst].entry
dstName = dstList[iDst].name
}
if src == nil && dst == nil {
break
}
if src != nil && iSrc > 0 {
prev := srcList[iSrc-1].Remote()
if srcRemote == prev {
prev := srcList[iSrc-1].name
if srcName == prev {
Logf(src, "Duplicate %s found in source - ignoring", DirEntryType(src))
src = nil // ignore the src
} else if srcRemote < prev {
} else if srcName < prev {
Errorf(src, "Out of order listing in source")
src = nil // ignore the src
}
}
if dst != nil && iDst > 0 {
prev := dstList[iDst-1].Remote()
if dstRemote == prev {
prev := dstList[iDst-1].name
if dstName == prev {
Logf(dst, "Duplicate %s found in destination - ignoring", DirEntryType(dst))
dst = nil // ignore the dst
} else if dstRemote < prev {
} else if dstName < prev {
Errorf(dst, "Out of order listing in destination")
dst = nil // ignore the dst
}
}
if src != nil && dst != nil {
if srcRemote < dstRemote {
if srcName < dstName {
dst = nil
iDst-- // retry the dst
} else if srcRemote > dstRemote {
} else if srcName > dstName {
src = nil
iSrc-- // retry the src
}
@ -271,7 +352,7 @@ func (m *march) processJob(job listDirJob) (jobs []listDirJob) {
}
// Work out what to do and do it
srcOnly, dstOnly, matches := matchListings(srcList, dstList)
srcOnly, dstOnly, matches := matchListings(srcList, dstList, m.transforms)
for _, src := range srcOnly {
if m.aborting() {
return nil

View file

@ -3,25 +3,53 @@
package fs
import (
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
func TestNewMatchEntries(t *testing.T) {
var (
a = mockObject("path/a")
A = mockObject("path/A")
B = mockObject("path/B")
c = mockObject("path/c")
)
es := newMatchEntries(DirEntries{a, A, B, c}, nil)
assert.Equal(t, es, matchEntries{
{name: "A", leaf: "A", entry: A},
{name: "B", leaf: "B", entry: B},
{name: "a", leaf: "a", entry: a},
{name: "c", leaf: "c", entry: c},
})
es = newMatchEntries(DirEntries{a, A, B, c}, []matchTransformFn{strings.ToLower})
assert.Equal(t, es, matchEntries{
{name: "a", leaf: "A", entry: A},
{name: "a", leaf: "a", entry: a},
{name: "b", leaf: "B", entry: B},
{name: "c", leaf: "c", entry: c},
})
}
func TestMatchListings(t *testing.T) {
var (
a = mockObject("a")
A = mockObject("A")
b = mockObject("b")
c = mockObject("c")
d = mockObject("d")
)
for _, test := range []struct {
what string
input DirEntries // pairs of input src, dst
srcOnly DirEntries
dstOnly DirEntries
matches []matchPair // pairs of output
what string
input DirEntries // pairs of input src, dst
srcOnly DirEntries
dstOnly DirEntries
matches []matchPair // pairs of output
transforms []matchTransformFn
}{
{
what: "only src or dst",
@ -92,6 +120,28 @@ func TestMatchListings(t *testing.T) {
},
},
{
what: "Case insensitive duplicate - no transform",
input: DirEntries{
a, a,
A, A,
},
matches: []matchPair{
{A, A},
{a, a},
},
},
{
what: "Case insensitive duplicate - transform to lower case",
input: DirEntries{
a, a,
A, A,
},
matches: []matchPair{
{A, A},
},
transforms: []matchTransformFn{strings.ToLower},
},
/*{
what: "Out of order",
input: DirEntries{
c, nil,
@ -104,7 +154,7 @@ func TestMatchListings(t *testing.T) {
dstOnly: DirEntries{
b,
},
},
},*/
} {
var srcList, dstList DirEntries
for i := 0; i < len(test.input); i += 2 {
@ -116,12 +166,12 @@ func TestMatchListings(t *testing.T) {
dstList = append(dstList, dst)
}
}
srcOnly, dstOnly, matches := matchListings(srcList, dstList)
srcOnly, dstOnly, matches := matchListings(srcList, dstList, test.transforms)
assert.Equal(t, test.srcOnly, srcOnly, test.what)
assert.Equal(t, test.dstOnly, dstOnly, test.what)
assert.Equal(t, test.matches, matches, test.what)
// now swap src and dst
dstOnly, srcOnly, matches = matchListings(dstList, srcList)
dstOnly, srcOnly, matches = matchListings(dstList, srcList, test.transforms)
assert.Equal(t, test.srcOnly, srcOnly, test.what)
assert.Equal(t, test.dstOnly, dstOnly, test.what)
assert.Equal(t, test.matches, matches, test.what)

View file

@ -10,6 +10,7 @@ import (
"github.com/ncw/rclone/fstest"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/text/unicode/norm"
)
// Check dry run is working
@ -961,3 +962,32 @@ func testSyncBackupDir(t *testing.T, suffix string) {
}
func TestSyncBackupDir(t *testing.T) { testSyncBackupDir(t, "") }
func TestSyncBackupDirWithSuffix(t *testing.T) { testSyncBackupDir(t, ".bak") }
// Check we can sync two files with differing UTF-8 representations
func TestSyncUTFNorm(t *testing.T) {
r := NewRun(t)
defer r.Finalise()
// Two strings with different unicode normalization (from OS X)
Encoding1 := "Testêé"
Encoding2 := "Testêé"
assert.NotEqual(t, Encoding1, Encoding2)
assert.Equal(t, norm.NFC.String(Encoding1), norm.NFC.String(Encoding2))
file1 := r.WriteFile(Encoding1, "This is a test", t1)
fstest.CheckItems(t, r.flocal, file1)
file2 := r.WriteObject(Encoding2, "This is a old test", t2)
fstest.CheckItems(t, r.fremote, file2)
fs.Stats.ResetCounters()
err := fs.Sync(r.fremote, r.flocal)
require.NoError(t, err)
// We should have transferred exactly one file, but kept the
// normalized state of the file.
assert.Equal(t, int64(1), fs.Stats.GetTransfers())
fstest.CheckItems(t, r.flocal, file1)
file1.Path = file2.Path
fstest.CheckItems(t, r.fremote, file1)
}

View file

@ -15,10 +15,9 @@ import (
"time"
"unicode/utf8"
"golang.org/x/text/unicode/norm"
"github.com/ncw/rclone/fs"
"github.com/pkg/errors"
"google.golang.org/appengine/log"
)
var (
@ -82,6 +81,10 @@ type Object struct {
func NewFs(name, root string) (fs.Fs, error) {
var err error
if *noUTFNorm {
log.Errorf(nil, "The --local-no-unicode-normalization flag is deprecated and will be removed")
}
nounc := fs.ConfigFileGet(name, "nounc")
f := &Fs{
name: name,
@ -273,9 +276,6 @@ func (f *Fs) cleanRemote(name string) string {
f.wmu.Unlock()
name = string([]rune(name))
}
if !*noUTFNorm {
name = norm.NFC.String(name)
}
name = filepath.ToSlash(name)
return name
}