From 8ef551bf9c7f01c02e7457e4652bd7dba5310fed Mon Sep 17 00:00:00 2001 From: Nick Craig-Wood Date: Sat, 5 Mar 2016 16:10:51 +0000 Subject: [PATCH] Make dedupe remove identical copies without asking and add non interactive mode - fixes #338 * Now removes identical copies without asking * Now obeys `--dry-run` * Implement `--dedupe-mode` for non interactive running * `--dedupe-mode interactive` - interactive the default. * `--dedupe-mode skip` - removes identical files then skips anything left. * `--dedupe-mode first` - removes identical files then keeps the first one. * `--dedupe-mode newest` - removes identical files then keeps the newest one. * `--dedupe-mode oldest` - removes identical files then keeps the oldest one. * `--dedupe-mode rename` - removes identical files then renames the rest to be different. * Add tests which will only run on Google Drive. --- docs/content/docs.md | 84 ++++++++++++---- fs/config.go | 19 ++++ fs/operations.go | 220 +++++++++++++++++++++++++++++++----------- fs/operations_test.go | 157 ++++++++++++++++++++++++++++++ rclone.go | 2 +- 5 files changed, 405 insertions(+), 77 deletions(-) diff --git a/docs/content/docs.md b/docs/content/docs.md index 30cfd2dee..aeda5e5d5 100644 --- a/docs/content/docs.md +++ b/docs/content/docs.md @@ -194,17 +194,51 @@ don't match. It doesn't alter the source or destination. ### rclone dedupe remote:path ### -Interactively find duplicate files and offer to delete all but one or -rename them to be different. Only useful with Google Drive which can -have duplicate file names. +By default `dedupe` interactively finds duplicate files and offers to +delete all but one or rename them to be different. Only useful with +Google Drive which can have duplicate file names. + +The `dedupe` command will delete all but one of any identical (same +md5sum) files it finds without confirmation. This means that for most +duplicated files the `dedupe` command will not be interactive. 
You +can use `--dry-run` to see what would happen without doing anything. + +Here is an example run. + +Before - with duplicates + +``` +$ rclone lsl drive:dupes + 6048320 2016-03-05 16:23:16.798000000 one.txt + 6048320 2016-03-05 16:23:11.775000000 one.txt + 564374 2016-03-05 16:23:06.731000000 one.txt + 6048320 2016-03-05 16:18:26.092000000 one.txt + 6048320 2016-03-05 16:22:46.185000000 two.txt + 1744073 2016-03-05 16:22:38.104000000 two.txt + 564374 2016-03-05 16:22:52.118000000 two.txt +``` + +Now the `dedupe` session ``` $ rclone dedupe drive:dupes -2016/01/31 14:13:11 Google drive root 'dupes': Looking for duplicates -two.txt: Found 3 duplicates - 1: 564374 bytes, 2016-01-31 14:07:22.159000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81 - 2: 1744073 bytes, 2016-01-31 14:07:12.490000000, md5sum 851957f7fb6f0bc4ce76be966d336802 - 3: 6048320 bytes, 2016-01-31 14:07:02.111000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36 +2016/03/05 16:24:37 Google drive root 'dupes': Looking for duplicates using interactive mode. 
+one.txt: Found 4 duplicates - deleting identical copies +one.txt: Deleting 2/3 identical duplicates (md5sum "1eedaa9fe86fd4b8632e2ac549403b36") +one.txt: 2 duplicates remain + 1: 6048320 bytes, 2016-03-05 16:23:16.798000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36 + 2: 564374 bytes, 2016-03-05 16:23:06.731000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81 +s) Skip and do nothing +k) Keep just one (choose which in next step) +r) Rename all to be different (by changing file.jpg to file-1.jpg) +s/k/r> k +Enter the number of the file to keep> 1 +one.txt: Deleted 1 extra copies +two.txt: Found 3 duplicates - deleting identical copies +two.txt: 3 duplicates remain + 1: 564374 bytes, 2016-03-05 16:22:52.118000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81 + 2: 6048320 bytes, 2016-03-05 16:22:46.185000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36 + 3: 1744073 bytes, 2016-03-05 16:22:38.104000000, md5sum 851957f7fb6f0bc4ce76be966d336802 s) Skip and do nothing k) Keep just one (choose which in next step) r) Rename all to be different (by changing file.jpg to file-1.jpg) @@ -212,27 +246,31 @@ s/k/r> r two-1.txt: renamed from: two.txt two-2.txt: renamed from: two.txt two-3.txt: renamed from: two.txt -one.txt: Found 2 duplicates - 1: 6579 bytes, 2016-01-31 14:05:01.235000000, md5sum 2b76c776249409d925ae7ccd49aea59b - 2: 6579 bytes, 2016-01-31 12:50:30.318000000, md5sum 2b76c776249409d925ae7ccd49aea59b -s) Skip and do nothing -k) Keep just one (choose which in next step) -r) Rename all to be different (by changing file.jpg to file-1.jpg) -s/k/r> k -Enter the number of the file to keep> 2 -one.txt: Deleted 1 extra copies ``` The result being ``` $ rclone lsl drive:dupes - 564374 2016-01-31 14:07:22.159000000 two-1.txt - 1744073 2016-01-31 14:07:12.490000000 two-2.txt - 6048320 2016-01-31 14:07:02.111000000 two-3.txt - 6579 2016-01-31 12:50:30.318000000 one.txt + 6048320 2016-03-05 16:23:16.798000000 one.txt + 564374 2016-03-05 16:22:52.118000000 two-1.txt + 6048320 2016-03-05 
16:22:46.185000000 two-2.txt + 1744073 2016-03-05 16:22:38.104000000 two-3.txt ``` +Dedupe can be run non interactively using the `--dedupe-mode` flag. + + * `--dedupe-mode interactive` - interactive as above. + * `--dedupe-mode skip` - removes identical files then skips anything left. + * `--dedupe-mode first` - removes identical files then keeps the first one. + * `--dedupe-mode newest` - removes identical files then keeps the newest one. + * `--dedupe-mode oldest` - removes identical files then keeps the oldest one. + * `--dedupe-mode rename` - removes identical files then renames the rest to be different. + +For example to rename all the identically named photos in your Google Photos directory, do + + rclone dedupe --dedupe-mode rename "drive:Google Photos" + ### rclone config ### Enter an interactive configuration session. @@ -342,6 +380,10 @@ The connection timeout is the amount of time rclone will wait for a connection to go through to a remote object storage system. It is `1m` by default. +### --dedupe-mode MODE ### + +Mode to run dedupe command in. One of `interactive`, `skip`, `first`, `newest`, `oldest`, `rename`. The default is `interactive`. See the dedupe command for more information as to what these options mean. + ### -n, --dry-run ### Do a trial run with no permanent changes. Use this to see what rclone diff --git a/fs/config.go b/fs/config.go index bbea2e32d..08181f068 100644 --- a/fs/config.go +++ b/fs/config.go @@ -83,6 +83,7 @@ var ( lowLevelRetries = pflag.IntP("low-level-retries", "", 10, "Number of low level retries to do.") updateOlder = pflag.BoolP("update", "u", false, "Skip files that are newer on the destination.") noGzip = pflag.BoolP("no-gzip-encoding", "", false, "Don't set Accept-Encoding: gzip.") + dedupeMode = pflag.StringP("dedupe-mode", "", "interactive", "Dedupe mode interactive|skip|first|newest|oldest|rename.") bwLimit SizeSuffix // Key to use for password en/decryption. 
@@ -203,6 +204,7 @@ type ConfigInfo struct { LowLevelRetries int UpdateOlder bool // Skip files that are newer on the destination NoGzip bool // Disable compression + DedupeMode DeduplicateMode } // Transport returns an http.RoundTripper with the correct timeouts @@ -311,6 +313,23 @@ func LoadConfig() { Config.DeleteDuring = *deleteDuring Config.DeleteAfter = *deleteAfter + switch strings.ToLower(*dedupeMode) { + case "interactive": + Config.DedupeMode = DeduplicateInteractive + case "skip": + Config.DedupeMode = DeduplicateSkip + case "first": + Config.DedupeMode = DeduplicateFirst + case "newest": + Config.DedupeMode = DeduplicateNewest + case "oldest": + Config.DedupeMode = DeduplicateOldest + case "rename": + Config.DedupeMode = DeduplicateRename + default: + log.Fatalf(`Unknown mode for --dedupe-mode %q.`, *dedupeMode) + } + switch { case *deleteBefore && (*deleteDuring || *deleteAfter), *deleteDuring && *deleteAfter: diff --git a/fs/operations.go b/fs/operations.go index e042fe106..04f901b9d 100644 --- a/fs/operations.go +++ b/fs/operations.go @@ -5,8 +5,10 @@ package fs import ( "fmt" "io" + "log" "mime" "path" + "sort" "strings" "sync" "sync/atomic" @@ -405,6 +407,23 @@ func PairMover(in ObjectPairChan, fdst Fs, wg *sync.WaitGroup) { } } +// DeleteFile deletes a single file respecting --dry-run and accumulating stats and errors. 
+func DeleteFile(dst Object) { + if Config.DryRun { + Log(dst, "Not deleting as --dry-run") + } else { + Stats.Checking(dst) + err := dst.Remove() + Stats.DoneChecking(dst) + if err != nil { + Stats.Error() + ErrorLog(dst, "Couldn't delete: %s", err) + } else { + Debug(dst, "Deleted") + } + } +} + // DeleteFiles removes all the files passed in the channel func DeleteFiles(toBeDeleted ObjectsChan) { var wg sync.WaitGroup @@ -413,19 +432,7 @@ func DeleteFiles(toBeDeleted ObjectsChan) { go func() { defer wg.Done() for dst := range toBeDeleted { - if Config.DryRun { - Log(dst, "Not deleting as --dry-run") - } else { - Stats.Checking(dst) - err := dst.Remove() - Stats.DoneChecking(dst) - if err != nil { - Stats.Error() - ErrorLog(dst, "Couldn't delete: %s", err) - } else { - Debug(dst, "Deleted") - } - } + DeleteFile(dst) } }() } @@ -958,15 +965,132 @@ func Delete(f Fs) error { return err } +// dedupeRename renames the objs slice to different names +func dedupeRename(remote string, objs []Object) { + f := objs[0].Fs() + mover, ok := f.(Mover) + if !ok { + log.Fatalf("Fs %v doesn't support Move", f) + } + ext := path.Ext(remote) + base := remote[:len(remote)-len(ext)] + for i, o := range objs { + newName := fmt.Sprintf("%s-%d%s", base, i+1, ext) + if !Config.DryRun { + newObj, err := mover.Move(o, newName) + if err != nil { + Stats.Error() + ErrorLog(o, "Failed to rename: %v", err) + continue + } + Log(newObj, "renamed from: %v", o) + } else { + Log(remote, "Not renaming to %q as --dry-run", newName) + } + } +} + +// dedupeDeleteAllButOne deletes all but the one in keep +func dedupeDeleteAllButOne(keep int, remote string, objs []Object) { + for i, o := range objs { + if i == keep { + continue + } + DeleteFile(o) + } + Log(remote, "Deleted %d extra copies", len(objs)-1) +} + +// dedupeDeleteIdentical deletes all but one of identical (by hash) copies +func dedupeDeleteIdentical(remote string, objs []Object) []Object { + // See how many of these duplicates are identical + 
byHash := make(map[string][]Object, len(objs)) + for _, o := range objs { + md5sum, err := o.Hash(HashMD5) + if err == nil { + byHash[md5sum] = append(byHash[md5sum], o) + } + } + + // Delete identical duplicates, refilling obj with the ones remaining + objs = nil + for md5sum, hashObjs := range byHash { + if len(hashObjs) > 1 { + Log(remote, "Deleting %d/%d identical duplicates (md5sum %q)", len(hashObjs)-1, len(hashObjs), md5sum) + for _, o := range hashObjs[1:] { + DeleteFile(o) + } + } + objs = append(objs, hashObjs[0]) + } + + return objs +} + +// dedupeInteractive interactively dedupes the slice of objects +func dedupeInteractive(remote string, objs []Object) { + fmt.Printf("%s: %d duplicates remain\n", remote, len(objs)) + for i, o := range objs { + md5sum, err := o.Hash(HashMD5) + if err != nil { + md5sum = err.Error() + } + fmt.Printf(" %d: %12d bytes, %s, md5sum %32s\n", i+1, o.Size(), o.ModTime().Format("2006-01-02 15:04:05.000000000"), md5sum) + } + switch Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) { + case 's': + case 'k': + keep := ChooseNumber("Enter the number of the file to keep", 1, len(objs)) + dedupeDeleteAllButOne(keep-1, remote, objs) + case 'r': + dedupeRename(remote, objs) + } +} + +type objectsSortedByModTime []Object + +func (objs objectsSortedByModTime) Len() int { return len(objs) } +func (objs objectsSortedByModTime) Swap(i, j int) { objs[i], objs[j] = objs[j], objs[i] } +func (objs objectsSortedByModTime) Less(i, j int) bool { + return objs[i].ModTime().Before(objs[j].ModTime()) +} + +// DeduplicateMode is how the dedupe command chooses what to do +type DeduplicateMode int + +// Deduplicate modes +const ( + DeduplicateInteractive DeduplicateMode = iota // interactively ask the user + DeduplicateSkip // skip all conflicts + DeduplicateFirst // choose the first object + DeduplicateNewest // choose the newest object + 
DeduplicateOldest // choose the oldest object + DeduplicateRename // rename the objects +) + +func (mode DeduplicateMode) String() string { + switch mode { + case DeduplicateInteractive: + return "interactive" + case DeduplicateSkip: + return "skip" + case DeduplicateFirst: + return "first" + case DeduplicateNewest: + return "newest" + case DeduplicateOldest: + return "oldest" + case DeduplicateRename: + return "rename" + } + return "unknown" +} + // Deduplicate interactively finds duplicate files and offers to // delete all but one or rename them to be different. Only useful with // Google Drive which can have duplicate file names. -func Deduplicate(f Fs) error { - mover, ok := f.(Mover) - if !ok { - return fmt.Errorf("%v can't Move files", f) - } - Log(f, "Looking for duplicates") +func Deduplicate(f Fs, mode DeduplicateMode) error { + Log(f, "Looking for duplicates using %v mode.", mode) files := map[string][]Object{} for o := range f.List() { remote := o.Remote() @@ -974,43 +1098,29 @@ func Deduplicate(f Fs) error { } for remote, objs := range files { if len(objs) > 1 { - fmt.Printf("%s: Found %d duplicates\n", remote, len(objs)) - for i, o := range objs { - md5sum, err := o.Hash(HashMD5) - if err != nil { - md5sum = err.Error() - } - fmt.Printf(" %d: %12d bytes, %s, md5sum %32s\n", i+1, o.Size(), o.ModTime().Format("2006-01-02 15:04:05.000000000"), md5sum) + Log(remote, "Found %d duplicates - deleting identical copies", len(objs)) + objs = dedupeDeleteIdentical(remote, objs) + if len(objs) <= 1 { + Log(remote, "All duplicates removed") + continue } - switch Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) { - case 's': - case 'k': - keep := ChooseNumber("Enter the number of the file to keep", 1, len(objs)) - deleted := 0 - for i, o := range objs { - if i+1 == keep { - continue - } - err := o.Remove() - if err != nil { - ErrorLog(o, "Failed to delete: %v", 
err) - continue - } - deleted++ - } - fmt.Printf("%s: Deleted %d extra copies\n", remote, deleted) - case 'r': - ext := path.Ext(remote) - base := remote[:len(remote)-len(ext)] - for i, o := range objs { - newName := fmt.Sprintf("%s-%d%s", base, i+1, ext) - newObj, err := mover.Move(o, newName) - if err != nil { - ErrorLog(o, "Failed to rename: %v", err) - continue - } - fmt.Printf("%v: renamed from: %v\n", newObj, o) - } + switch mode { + case DeduplicateInteractive: + dedupeInteractive(remote, objs) + case DeduplicateFirst: + dedupeDeleteAllButOne(0, remote, objs) + case DeduplicateNewest: + sort.Sort(objectsSortedByModTime(objs)) // sort oldest first + dedupeDeleteAllButOne(len(objs)-1, remote, objs) + case DeduplicateOldest: + sort.Sort(objectsSortedByModTime(objs)) // sort oldest first + dedupeDeleteAllButOne(0, remote, objs) + case DeduplicateRename: + dedupeRename(remote, objs) + case DeduplicateSkip: + // skip + default: + //skip } } } diff --git a/fs/operations_test.go b/fs/operations_test.go index 8906379bf..1fb84d301 100644 --- a/fs/operations_test.go +++ b/fs/operations_test.go @@ -975,3 +975,160 @@ func TestCheck(t *testing.T) { fstest.CheckItems(t, r.flocal, file1, file2, file3) check(5, 0) } + +func (r *Run) checkWithDuplicates(t *testing.T, items ...fstest.Item) { + objects, size, err := fs.Count(r.fremote) + if err != nil { + t.Fatalf("Error listing: %v", err) + } + if objects != int64(len(items)) { + t.Fatalf("Error listing want %d objects, got %d", len(items), objects) + } + wantSize := int64(0) + for _, item := range items { + wantSize += item.Size + } + if wantSize != size { + t.Fatalf("Error listing want %d size, got %d", wantSize, size) + } +} + +func TestDeduplicateInteractive(t *testing.T) { + if *RemoteName != "TestDrive:" { + t.Skip("Can only test deduplicate on google drive") + } + r := NewRun(t) + defer r.Finalise() + + file1 := r.WriteObject("one", "This is one", t1) + file2 := r.WriteObject("one", "This is one", t1) + file3 := 
r.WriteObject("one", "This is one", t1) + r.checkWithDuplicates(t, file1, file2, file3) + + err := fs.Deduplicate(r.fremote, fs.DeduplicateInteractive) + if err != nil { + t.Fatalf("fs.Deduplicate returned error: %v", err) + } + + fstest.CheckItems(t, r.fremote, file1) +} + +func TestDeduplicateSkip(t *testing.T) { + if *RemoteName != "TestDrive:" { + t.Skip("Can only test deduplicate on google drive") + } + r := NewRun(t) + defer r.Finalise() + + file1 := r.WriteObject("one", "This is one", t1) + file2 := r.WriteObject("one", "This is one", t1) + file3 := r.WriteObject("one", "This is another one", t1) + r.checkWithDuplicates(t, file1, file2, file3) + + err := fs.Deduplicate(r.fremote, fs.DeduplicateSkip) + if err != nil { + t.Fatalf("fs.Deduplicate returned error: %v", err) + } + + r.checkWithDuplicates(t, file1, file3) +} + +func TestDeduplicateFirst(t *testing.T) { + if *RemoteName != "TestDrive:" { + t.Skip("Can only test deduplicate on google drive") + } + r := NewRun(t) + defer r.Finalise() + + file1 := r.WriteObject("one", "This is one", t1) + file2 := r.WriteObject("one", "This is one A", t1) + file3 := r.WriteObject("one", "This is one BB", t1) + r.checkWithDuplicates(t, file1, file2, file3) + + err := fs.Deduplicate(r.fremote, fs.DeduplicateFirst) + if err != nil { + t.Fatalf("fs.Deduplicate returned error: %v", err) + } + + objects, size, err := fs.Count(r.fremote) + if err != nil { + t.Fatalf("Error listing: %v", err) + } + if objects != 1 { + t.Errorf("Expecting 1 object got %v", objects) + } + if size != file1.Size && size != file2.Size && size != file3.Size { + t.Errorf("Size not one of the object sizes %d", size) + } +} + +func TestDeduplicateNewest(t *testing.T) { + if *RemoteName != "TestDrive:" { + t.Skip("Can only test deduplicate on google drive") + } + r := NewRun(t) + defer r.Finalise() + + file1 := r.WriteObject("one", "This is one", t1) + file2 := r.WriteObject("one", "This is one too", t2) + file3 := r.WriteObject("one", "This is another 
one", t3) + r.checkWithDuplicates(t, file1, file2, file3) + + err := fs.Deduplicate(r.fremote, fs.DeduplicateNewest) + if err != nil { + t.Fatalf("fs.Deduplicate returned error: %v", err) + } + + fstest.CheckItems(t, r.fremote, file3) +} + +func TestDeduplicateOldest(t *testing.T) { + if *RemoteName != "TestDrive:" { + t.Skip("Can only test deduplicate on google drive") + } + r := NewRun(t) + defer r.Finalise() + + file1 := r.WriteObject("one", "This is one", t1) + file2 := r.WriteObject("one", "This is one too", t2) + file3 := r.WriteObject("one", "This is another one", t3) + r.checkWithDuplicates(t, file1, file2, file3) + + err := fs.Deduplicate(r.fremote, fs.DeduplicateOldest) + if err != nil { + t.Fatalf("fs.Deduplicate returned error: %v", err) + } + + fstest.CheckItems(t, r.fremote, file1) +} + +func TestDeduplicateRename(t *testing.T) { + if *RemoteName != "TestDrive:" { + t.Skip("Can only test deduplicate on google drive") + } + r := NewRun(t) + defer r.Finalise() + + file1 := r.WriteObject("one.txt", "This is one", t1) + file2 := r.WriteObject("one.txt", "This is one too", t2) + file3 := r.WriteObject("one.txt", "This is another one", t3) + r.checkWithDuplicates(t, file1, file2, file3) + + err := fs.Deduplicate(r.fremote, fs.DeduplicateRename) + if err != nil { + t.Fatalf("fs.Deduplicate returned error: %v", err) + } + + for o := range r.fremote.List() { + remote := o.Remote() + if remote != "one-1.txt" && + remote != "one-2.txt" && + remote != "one-3.txt" { + t.Errorf("Bad file name after rename %q", remote) + } + size := o.Size() + if size != file1.Size && size != file2.Size && size != file3.Size { + t.Errorf("Size not one of the object sizes %d", size) + } + } +} diff --git a/rclone.go b/rclone.go index d51bfa504..32b7b0b3a 100644 --- a/rclone.go +++ b/rclone.go @@ -248,7 +248,7 @@ var Commands = []Command{ but one or rename them to be different. 
Only useful with Google Drive which can have duplicate file names.`, Run: func(fdst, fsrc fs.Fs) error { - return fs.Deduplicate(fdst) + return fs.Deduplicate(fdst, fs.Config.DedupeMode) }, MinArgs: 1, MaxArgs: 1,