vfs: fix unicode normalization on macOS - fixes #7072

Before this change, the VFS layer did not properly handle unicode normalization,
which caused problems particularly for users of macOS. While attempts were made
to handle it with various `-o modules=iconv` combinations, this was an imperfect
solution, as no one combination allowed both NFC and NFD content to
simultaneously be both visible and editable via Finder.

After this change, the VFS supports `--no-unicode-normalization` (default `false`)
via the existing `--vfs-case-insensitive` logic, which is extended to apply to both
case insensitivity and unicode normalization form.

This change also adds an additional flag, `--vfs-block-norm-dupes`, to address a
probably rare but potentially possible scenario where a directory contains
multiple duplicate filenames after applying case and unicode normalization
settings. In such a scenario, this flag (disabled by default) hides the
duplicates. This comes with a performance tradeoff, as rclone will have to scan
the entire directory for duplicates when listing a directory. For this reason,
it is recommended to leave this disabled if not needed. However, macOS users may
wish to consider using it, as otherwise, if a remote directory contains both NFC
and NFD versions of the same filename, an odd situation will occur: both
versions of the file will be visible in the mount, and both will appear to be
editable, however, editing either version will actually result in only the NFD
version getting edited under the hood. `--vfs-block-norm-dupes` prevents this
confusion by detecting this scenario, hiding the duplicates, and logging an
error, similar to how this is handled in `rclone sync`.
This commit is contained in:
nielash 2024-02-05 02:58:11 -05:00 committed by Nick Craig-Wood
parent dcdbad3554
commit fd8faeb0e6
8 changed files with 115 additions and 26 deletions

View file

@ -116,13 +116,6 @@ func mountOptions(VFS *vfs.VFS, device string, mountpoint string, opt *mountlib.
for _, option := range opt.ExtraFlags {
options = append(options, option)
}
if runtime.GOOS == "darwin" {
if !findOption("modules=iconv", options) {
iconv := "modules=iconv,from_code=UTF-8,to_code=UTF-8-MAC"
options = append(options, "-o", iconv)
fs.Debugf(nil, "Adding \"-o %s\" for macOS", iconv)
}
}
return options
}

View file

@ -257,7 +257,12 @@ Mounting on macOS can be done either via [built-in NFS server](/commands/rclone_
FUSE driver utilizing a macOS kernel extension (kext). FUSE-T is an alternative FUSE system
which "mounts" via an NFSv4 local server.
## NFS mount
##### Unicode Normalization
It is highly recommended to keep the default of `--no-unicode-normalization=false`
for all `mount` and `serve` commands on macOS. For details, see [vfs-case-sensitivity](https://rclone.org/commands/rclone_mount/#vfs-case-sensitivity).
#### NFS mount
This method spins up an NFS server using [serve nfs](/commands/rclone_serve_nfs/) command and mounts
it to the specified mountpoint. If you run this in background mode using |--daemon|, you will need to
@ -294,15 +299,6 @@ As per the [FUSE-T wiki](https://github.com/macos-fuse-t/fuse-t/wiki#caveats):
This means that viewing files with various tools, notably macOS Finder, will cause rlcone
to update the modification time of the file. This may make rclone upload a full new copy
of the file.
##### Unicode Normalization
Rclone includes flags for unicode normalization with macFUSE that should be updated
for FUSE-T. See [this forum post](https://forum.rclone.org/t/some-unicode-forms-break-mount-on-macos-with-fuse-t/36403)
and [FUSE-T issue #16](https://github.com/macos-fuse-t/fuse-t/issues/16). The following
flag should be added to the `rclone mount` command.
-o modules=iconv,from_code=UTF-8,to_code=UTF-8
##### Read Only mounts

View file

@ -380,10 +380,19 @@ func CheckDownload(ctx context.Context, opt *CheckOpt) error {
// so that it matches behavior of Check (where it's handled by March)
func ApplyTransforms(ctx context.Context, s string) string {
ci := fs.GetConfig(ctx)
if !ci.NoUnicodeNormalization {
return ToNormal(s, !ci.NoUnicodeNormalization, ci.IgnoreCaseSync)
}
// ToNormal normalizes case and unicode form and returns the transformed string.
// It is similar to ApplyTransforms but does not use a context.
// If normUnicode == true, s will be transformed to NFC.
// If normCase == true, s will be transformed to lowercase.
// If both are true, both transformations will be performed.
func ToNormal(s string, normUnicode, normCase bool) string {
if normUnicode {
s = norm.NFC.String(s)
}
if ci.IgnoreCaseSync {
if normCase {
s = strings.ToLower(s)
}
return s

View file

@ -18,6 +18,7 @@ import (
"github.com/rclone/rclone/fs/operations"
"github.com/rclone/rclone/fs/walk"
"github.com/rclone/rclone/vfs/vfscommon"
"golang.org/x/text/unicode/norm"
)
// Dir represents a directory entry
@ -466,7 +467,6 @@ func (d *Dir) AddVirtual(leaf string, size int64, isDir bool) {
node = f
}
d.addObject(node)
}
// delObject removes an object from the directory
@ -514,6 +514,35 @@ func (d *Dir) _readDir() error {
return err
}
if d.vfs.Opt.BlockNormDupes { // do this only if requested, as it will have a performance hit
ci := fs.GetConfig(context.TODO())
// sort entries such that NFD comes before NFC of same name
sort.Slice(entries, func(i, j int) bool {
if entries[i] != entries[j] && fs.DirEntryType(entries[i]) == fs.DirEntryType(entries[j]) && norm.NFC.String(entries[i].Remote()) == norm.NFC.String(entries[j].Remote()) {
if norm.NFD.IsNormalString(entries[i].Remote()) && !norm.NFD.IsNormalString(entries[j].Remote()) {
return true
}
}
return entries.Less(i, j)
})
// detect dupes, remove them from the list and log an error
normalizedNames := make(map[string]struct{}, entries.Len())
filteredEntries := make(fs.DirEntries, 0)
for _, e := range entries {
normName := fmt.Sprintf("%s-%T", operations.ToNormal(e.Remote(), !ci.NoUnicodeNormalization, (ci.IgnoreCaseSync || d.vfs.Opt.CaseInsensitive)), e) // include type to track objects and dirs separately
_, found := normalizedNames[normName]
if found {
fs.Errorf(e.Remote(), "duplicate normalized names detected - skipping")
continue
}
normalizedNames[normName] = struct{}{}
filteredEntries = append(filteredEntries, e)
}
entries = filteredEntries
}
err = d._readDirFromEntries(entries, nil, time.Time{})
if err != nil {
return err
@ -767,15 +796,18 @@ func (d *Dir) stat(leaf string) (Node, error) {
}
item, ok := d.items[leaf]
if !ok && d.vfs.Opt.CaseInsensitive {
leafLower := strings.ToLower(leaf)
ci := fs.GetConfig(context.TODO())
normUnicode := !ci.NoUnicodeNormalization
normCase := ci.IgnoreCaseSync || d.vfs.Opt.CaseInsensitive
if !ok && (normUnicode || normCase) {
leafNormalized := operations.ToNormal(leaf, normUnicode, normCase) // this handles both case and unicode normalization
for name, node := range d.items {
if strings.ToLower(name) == leafLower {
if operations.ToNormal(name, normUnicode, normCase) == leafNormalized {
if ok {
// duplicate case insensitive match is an error
return nil, fmt.Errorf("duplicate filename %q detected with --vfs-case-insensitive set", leaf)
// duplicate normalized match is an error
return nil, fmt.Errorf("duplicate filename %q detected with case/unicode normalization settings", leaf)
}
// found a case insensitive match
// found a normalized match
ok = true
item = node
}

View file

@ -309,6 +309,28 @@ If the flag is not provided on the command line, then its default value depends
on the operating system where rclone runs: "true" on Windows and macOS, "false"
otherwise. If the flag is provided without a value, then it is "true".
The `--no-unicode-normalization` flag controls whether a similar "fixup" is
performed for filenames that differ but are [canonically
equivalent](https://en.wikipedia.org/wiki/Unicode_equivalence) with respect to
unicode. Unicode normalization can be particularly helpful for users of macOS,
which prefers form NFD instead of the NFC used by most other platforms. It is
therefore highly recommended to keep the default of `false` on macOS, to avoid
encoding compatibility issues.
In the (probably unlikely) event that a directory has multiple duplicate
filenames after applying case and unicode normalization, the `--vfs-block-norm-dupes`
flag allows hiding these duplicates. This comes with a performance tradeoff, as
rclone will have to scan the entire directory for duplicates when listing a
directory. For this reason, it is recommended to leave this disabled if not
needed. However, macOS users may wish to consider using it, as otherwise, if a
remote directory contains both NFC and NFD versions of the same filename, an odd
situation will occur: both versions of the file will be visible in the mount,
and both will appear to be editable, however, editing either version will
actually result in only the NFD version getting edited under the hood. `--vfs-block-
norm-dupes` prevents this confusion by detecting this scenario, hiding the
duplicates, and logging an error, similar to how this is handled in `rclone
sync`.
### VFS Disk Options
This flag allows you to manually set the statistics about the filing system.

View file

@ -5,10 +5,12 @@ import (
"os"
"testing"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fstest"
"github.com/rclone/rclone/vfs/vfscommon"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/text/unicode/norm"
)
func TestCaseSensitivity(t *testing.T) {
@ -160,3 +162,36 @@ func assertFileAbsentVFS(t *testing.T, vfs *VFS, name string) {
assert.Error(t, err)
assert.Equal(t, err, ENOENT)
}
func TestUnicodeNormalization(t *testing.T) {
r := fstest.NewRun(t)
var (
nfc = norm.NFC.String(norm.NFD.String("測試_Русский___ě_áñ"))
nfd = norm.NFD.String(nfc)
both = "normal name with no special characters.txt"
)
// Create test files
ctx := context.Background()
file1 := r.WriteObject(ctx, both, "data1", t1)
file2 := r.WriteObject(ctx, nfc, "data2", t2)
r.CheckRemoteItems(t, file1, file2)
// Create VFS
opt := vfscommon.DefaultOpt
vfs := New(r.Fremote, &opt)
defer cleanupVFS(t, vfs)
// assert that both files are found under NFD-normalized names
assertFileDataVFS(t, vfs, norm.NFD.String(both), "data1")
assertFileDataVFS(t, vfs, nfd, "data2")
// change ci.NoUnicodeNormalization to true and verify that only file1 is found
ci := fs.GetConfig(ctx) // need to set the global config here as the *Dir methods don't take a ctx param
oldVal := ci.NoUnicodeNormalization
defer func() { fs.GetConfig(ctx).NoUnicodeNormalization = oldVal }() // restore the prior value after the test
ci.NoUnicodeNormalization = true
assertFileDataVFS(t, vfs, norm.NFD.String(both), "data1")
assertFileAbsentVFS(t, vfs, nfd)
}

View file

@ -30,6 +30,7 @@ type Options struct {
CacheMinFreeSpace fs.SizeSuffix
CachePollInterval time.Duration
CaseInsensitive bool
BlockNormDupes bool
WriteWait time.Duration // time to wait for in-sequence write
ReadWait time.Duration // time to wait for in-sequence read
WriteBack time.Duration // time to wait before writing back dirty files

View file

@ -35,6 +35,7 @@ func AddFlags(flagSet *pflag.FlagSet) {
flags.FVarP(flagSet, DirPerms, "dir-perms", "", "Directory permissions", "VFS")
flags.FVarP(flagSet, FilePerms, "file-perms", "", "File permissions", "VFS")
flags.BoolVarP(flagSet, &Opt.CaseInsensitive, "vfs-case-insensitive", "", Opt.CaseInsensitive, "If a file name not found, find a case insensitive match", "VFS")
flags.BoolVarP(flagSet, &Opt.BlockNormDupes, "vfs-block-norm-dupes", "", Opt.BlockNormDupes, "If duplicate filenames exist in the same directory (after normalization), log an error and hide the duplicates (may have a performance cost)", "VFS")
flags.DurationVarP(flagSet, &Opt.WriteWait, "vfs-write-wait", "", Opt.WriteWait, "Time to wait for in-sequence write before giving error", "VFS")
flags.DurationVarP(flagSet, &Opt.ReadWait, "vfs-read-wait", "", Opt.ReadWait, "Time to wait for in-sequence read before seeking", "VFS")
flags.DurationVarP(flagSet, &Opt.WriteBack, "vfs-write-back", "", Opt.WriteBack, "Time to writeback files after last use when using cache", "VFS")