fs/operations: make dedupe work with mega
* factor into its own files * remove assumptions about having a given hash type * make tests work if the remote has no hash
This commit is contained in:
parent
792c9e185e
commit
d97fe3b824
4 changed files with 491 additions and 438 deletions
292
fs/operations/dedupe.go
Normal file
292
fs/operations/dedupe.go
Normal file
|
@ -0,0 +1,292 @@
|
|||
// dedupe - gets rid of identical files on remotes which can have duplicate file names (drive, mega)
|
||||
|
||||
package operations
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"path"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/ncw/rclone/fs"
|
||||
"github.com/ncw/rclone/fs/config"
|
||||
"github.com/ncw/rclone/fs/hash"
|
||||
"github.com/ncw/rclone/fs/walk"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/spf13/pflag"
|
||||
)
|
||||
|
||||
// dedupeRename renames the objs slice to different names
|
||||
func dedupeRename(remote string, objs []fs.Object) {
|
||||
f := objs[0].Fs()
|
||||
doMove := f.Features().Move
|
||||
if doMove == nil {
|
||||
log.Fatalf("Fs %v doesn't support Move", f)
|
||||
}
|
||||
ext := path.Ext(remote)
|
||||
base := remote[:len(remote)-len(ext)]
|
||||
for i, o := range objs {
|
||||
newName := fmt.Sprintf("%s-%d%s", base, i+1, ext)
|
||||
if !fs.Config.DryRun {
|
||||
newObj, err := doMove(o, newName)
|
||||
if err != nil {
|
||||
fs.CountError(err)
|
||||
fs.Errorf(o, "Failed to rename: %v", err)
|
||||
continue
|
||||
}
|
||||
fs.Infof(newObj, "renamed from: %v", o)
|
||||
} else {
|
||||
fs.Logf(remote, "Not renaming to %q as --dry-run", newName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dedupeDeleteAllButOne deletes all but the one in keep
|
||||
func dedupeDeleteAllButOne(keep int, remote string, objs []fs.Object) {
|
||||
for i, o := range objs {
|
||||
if i == keep {
|
||||
continue
|
||||
}
|
||||
_ = DeleteFile(o)
|
||||
}
|
||||
fs.Logf(remote, "Deleted %d extra copies", len(objs)-1)
|
||||
}
|
||||
|
||||
// dedupeDeleteIdentical deletes all but one of identical (by hash) copies
|
||||
func dedupeDeleteIdentical(ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
|
||||
// See how many of these duplicates are identical
|
||||
byHash := make(map[string][]fs.Object, len(objs))
|
||||
for _, o := range objs {
|
||||
md5sum, err := o.Hash(ht)
|
||||
if err != nil || md5sum == "" {
|
||||
remainingObjs = append(remainingObjs, o)
|
||||
} else {
|
||||
byHash[md5sum] = append(byHash[md5sum], o)
|
||||
}
|
||||
}
|
||||
|
||||
// Delete identical duplicates, filling remainingObjs with the ones remaining
|
||||
for md5sum, hashObjs := range byHash {
|
||||
if len(hashObjs) > 1 {
|
||||
fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum)
|
||||
for _, o := range hashObjs[1:] {
|
||||
_ = DeleteFile(o)
|
||||
}
|
||||
}
|
||||
remainingObjs = append(remainingObjs, hashObjs[0])
|
||||
}
|
||||
|
||||
return remainingObjs
|
||||
}
|
||||
|
||||
// dedupeInteractive interactively dedupes the slice of objects
|
||||
func dedupeInteractive(ht hash.Type, remote string, objs []fs.Object) {
|
||||
fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
|
||||
for i, o := range objs {
|
||||
md5sum, err := o.Hash(ht)
|
||||
if err != nil {
|
||||
md5sum = err.Error()
|
||||
}
|
||||
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime().Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum)
|
||||
}
|
||||
switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
|
||||
case 's':
|
||||
case 'k':
|
||||
keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
|
||||
dedupeDeleteAllButOne(keep-1, remote, objs)
|
||||
case 'r':
|
||||
dedupeRename(remote, objs)
|
||||
}
|
||||
}
|
||||
|
||||
type objectsSortedByModTime []fs.Object
|
||||
|
||||
func (objs objectsSortedByModTime) Len() int { return len(objs) }
|
||||
func (objs objectsSortedByModTime) Swap(i, j int) { objs[i], objs[j] = objs[j], objs[i] }
|
||||
func (objs objectsSortedByModTime) Less(i, j int) bool {
|
||||
return objs[i].ModTime().Before(objs[j].ModTime())
|
||||
}
|
||||
|
||||
// DeduplicateMode selects how the dedupe command resolves a set of files
// that share a name but are not identical.
type DeduplicateMode int

// The available deduplicate modes.
const (
	// DeduplicateInteractive asks the user what to do for each conflict.
	DeduplicateInteractive DeduplicateMode = iota
	// DeduplicateSkip leaves all conflicts untouched.
	DeduplicateSkip
	// DeduplicateFirst keeps the first object.
	DeduplicateFirst
	// DeduplicateNewest keeps the newest object.
	DeduplicateNewest
	// DeduplicateOldest keeps the oldest object.
	DeduplicateOldest
	// DeduplicateRename renames the objects so the names differ.
	DeduplicateRename
)

// String returns the lower-case name of the mode, or "unknown" for a
// value that is not one of the declared modes.
func (x DeduplicateMode) String() string {
	names := [...]string{
		DeduplicateInteractive: "interactive",
		DeduplicateSkip:        "skip",
		DeduplicateFirst:       "first",
		DeduplicateNewest:      "newest",
		DeduplicateOldest:      "oldest",
		DeduplicateRename:      "rename",
	}
	if x < 0 || int(x) >= len(names) {
		return "unknown"
	}
	return names[x]
}
|
||||
|
||||
// Set a DeduplicateMode from a string
|
||||
func (x *DeduplicateMode) Set(s string) error {
|
||||
switch strings.ToLower(s) {
|
||||
case "interactive":
|
||||
*x = DeduplicateInteractive
|
||||
case "skip":
|
||||
*x = DeduplicateSkip
|
||||
case "first":
|
||||
*x = DeduplicateFirst
|
||||
case "newest":
|
||||
*x = DeduplicateNewest
|
||||
case "oldest":
|
||||
*x = DeduplicateOldest
|
||||
case "rename":
|
||||
*x = DeduplicateRename
|
||||
default:
|
||||
return errors.Errorf("Unknown mode for dedupe %q.", s)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Type of the value
|
||||
func (x *DeduplicateMode) Type() string {
|
||||
return "string"
|
||||
}
|
||||
|
||||
// Check it satisfies the interface
|
||||
var _ pflag.Value = (*DeduplicateMode)(nil)
|
||||
|
||||
// dedupeFindDuplicateDirs scans f for duplicate directories
|
||||
func dedupeFindDuplicateDirs(f fs.Fs) ([][]fs.Directory, error) {
|
||||
duplicateDirs := [][]fs.Directory{}
|
||||
err := walk.Walk(f, "", true, fs.Config.MaxDepth, func(dirPath string, entries fs.DirEntries, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
dirs := map[string][]fs.Directory{}
|
||||
entries.ForDir(func(d fs.Directory) {
|
||||
dirs[d.Remote()] = append(dirs[d.Remote()], d)
|
||||
})
|
||||
for _, ds := range dirs {
|
||||
if len(ds) > 1 {
|
||||
duplicateDirs = append(duplicateDirs, ds)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "find duplicate dirs")
|
||||
}
|
||||
return duplicateDirs, nil
|
||||
}
|
||||
|
||||
// dedupeMergeDuplicateDirs merges all the duplicate directories found
|
||||
func dedupeMergeDuplicateDirs(f fs.Fs, duplicateDirs [][]fs.Directory) error {
|
||||
mergeDirs := f.Features().MergeDirs
|
||||
if mergeDirs == nil {
|
||||
return errors.Errorf("%v: can't merge directories", f)
|
||||
}
|
||||
dirCacheFlush := f.Features().DirCacheFlush
|
||||
if dirCacheFlush == nil {
|
||||
return errors.Errorf("%v: can't flush dir cache", f)
|
||||
}
|
||||
for _, dirs := range duplicateDirs {
|
||||
if !fs.Config.DryRun {
|
||||
fs.Infof(dirs[0], "Merging contents of duplicate directories")
|
||||
err := mergeDirs(dirs)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "merge duplicate dirs")
|
||||
}
|
||||
} else {
|
||||
fs.Infof(dirs[0], "NOT Merging contents of duplicate directories as --dry-run")
|
||||
}
|
||||
}
|
||||
dirCacheFlush()
|
||||
return nil
|
||||
}
|
||||
|
||||
// Deduplicate interactively finds duplicate files and offers to
|
||||
// delete all but one or rename them to be different. Only useful with
|
||||
// Google Drive which can have duplicate file names.
|
||||
func Deduplicate(f fs.Fs, mode DeduplicateMode) error {
|
||||
fs.Infof(f, "Looking for duplicates using %v mode.", mode)
|
||||
|
||||
// Find duplicate directories first and fix them - repeat
|
||||
// until all fixed
|
||||
for {
|
||||
duplicateDirs, err := dedupeFindDuplicateDirs(f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(duplicateDirs) == 0 {
|
||||
break
|
||||
}
|
||||
err = dedupeMergeDuplicateDirs(f, duplicateDirs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if fs.Config.DryRun {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// find a hash to use
|
||||
ht := f.Hashes().GetOne()
|
||||
|
||||
// Now find duplicate files
|
||||
files := map[string][]fs.Object{}
|
||||
err := walk.Walk(f, "", true, fs.Config.MaxDepth, func(dirPath string, entries fs.DirEntries, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
entries.ForObject(func(o fs.Object) {
|
||||
remote := o.Remote()
|
||||
files[remote] = append(files[remote], o)
|
||||
})
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for remote, objs := range files {
|
||||
if len(objs) > 1 {
|
||||
fs.Logf(remote, "Found %d duplicates - deleting identical copies", len(objs))
|
||||
objs = dedupeDeleteIdentical(ht, remote, objs)
|
||||
if len(objs) <= 1 {
|
||||
fs.Logf(remote, "All duplicates removed")
|
||||
continue
|
||||
}
|
||||
switch mode {
|
||||
case DeduplicateInteractive:
|
||||
dedupeInteractive(ht, remote, objs)
|
||||
case DeduplicateFirst:
|
||||
dedupeDeleteAllButOne(0, remote, objs)
|
||||
case DeduplicateNewest:
|
||||
sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
|
||||
dedupeDeleteAllButOne(len(objs)-1, remote, objs)
|
||||
case DeduplicateOldest:
|
||||
sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
|
||||
dedupeDeleteAllButOne(0, remote, objs)
|
||||
case DeduplicateRename:
|
||||
dedupeRename(remote, objs)
|
||||
case DeduplicateSkip:
|
||||
// skip
|
||||
default:
|
||||
//skip
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
199
fs/operations/dedupe_test.go
Normal file
199
fs/operations/dedupe_test.go
Normal file
|
@ -0,0 +1,199 @@
|
|||
package operations_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/ncw/rclone/fs"
|
||||
"github.com/ncw/rclone/fs/hash"
|
||||
"github.com/ncw/rclone/fs/operations"
|
||||
"github.com/ncw/rclone/fs/walk"
|
||||
"github.com/ncw/rclone/fstest"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func skipIfCantDedupe(t *testing.T, f fs.Fs) {
|
||||
if !f.Features().DuplicateFiles {
|
||||
t.Skip("Can't test deduplicate - no duplicate files possible")
|
||||
}
|
||||
if f.Features().PutUnchecked == nil {
|
||||
t.Skip("Can't test deduplicate - no PutUnchecked")
|
||||
}
|
||||
if f.Features().MergeDirs == nil {
|
||||
t.Skip("Can't test deduplicate - no MergeDirs")
|
||||
}
|
||||
}
|
||||
|
||||
func skipIfNoHash(t *testing.T, f fs.Fs) {
|
||||
if f.Hashes().GetOne() == hash.None {
|
||||
t.Skip("Can't run this test without a hash")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeduplicateInteractive(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
skipIfNoHash(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file3 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateInteractive)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file1)
|
||||
}
|
||||
|
||||
func TestDeduplicateSkip(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
haveHash := r.Fremote.Hashes().GetOne() != hash.None
|
||||
|
||||
file1 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
files := []fstest.Item{file1}
|
||||
if haveHash {
|
||||
file2 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
files = append(files, file2)
|
||||
}
|
||||
file3 := r.WriteUncheckedObject("one", "This is another one", t1)
|
||||
files = append(files, file3)
|
||||
r.CheckWithDuplicates(t, files...)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateSkip)
|
||||
require.NoError(t, err)
|
||||
|
||||
r.CheckWithDuplicates(t, file1, file3)
|
||||
}
|
||||
|
||||
func TestDeduplicateFirst(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject("one", "This is one A", t1)
|
||||
file3 := r.WriteUncheckedObject("one", "This is one BB", t1)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateFirst)
|
||||
require.NoError(t, err)
|
||||
|
||||
// list until we get one object
|
||||
var objects, size int64
|
||||
for try := 1; try <= *fstest.ListRetries; try++ {
|
||||
objects, size, err = operations.Count(r.Fremote)
|
||||
require.NoError(t, err)
|
||||
if objects == 1 {
|
||||
break
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
}
|
||||
assert.Equal(t, int64(1), objects)
|
||||
if size != file1.Size && size != file2.Size && size != file3.Size {
|
||||
t.Errorf("Size not one of the object sizes %d", size)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeduplicateNewest(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject("one", "This is one too", t2)
|
||||
file3 := r.WriteUncheckedObject("one", "This is another one", t3)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateNewest)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file3)
|
||||
}
|
||||
|
||||
func TestDeduplicateOldest(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject("one", "This is one too", t2)
|
||||
file3 := r.WriteUncheckedObject("one", "This is another one", t3)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateOldest)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file1)
|
||||
}
|
||||
|
||||
func TestDeduplicateRename(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject("one.txt", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject("one.txt", "This is one too", t2)
|
||||
file3 := r.WriteUncheckedObject("one.txt", "This is another one", t3)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateRename)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.NoError(t, walk.Walk(r.Fremote, "", true, -1, func(dirPath string, entries fs.DirEntries, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
entries.ForObject(func(o fs.Object) {
|
||||
remote := o.Remote()
|
||||
if remote != "one-1.txt" &&
|
||||
remote != "one-2.txt" &&
|
||||
remote != "one-3.txt" {
|
||||
t.Errorf("Bad file name after rename %q", remote)
|
||||
}
|
||||
size := o.Size()
|
||||
if size != file1.Size && size != file2.Size && size != file3.Size {
|
||||
t.Errorf("Size not one of the object sizes %d", size)
|
||||
}
|
||||
})
|
||||
return nil
|
||||
}))
|
||||
}
|
||||
|
||||
// This should really be a unit test, but the test framework there
|
||||
// doesn't have enough tools to make it easy
|
||||
func TestMergeDirs(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
|
||||
mergeDirs := r.Fremote.Features().MergeDirs
|
||||
if mergeDirs == nil {
|
||||
t.Skip("Can't merge directories")
|
||||
}
|
||||
|
||||
file1 := r.WriteObject("dupe1/one.txt", "This is one", t1)
|
||||
file2 := r.WriteObject("dupe2/two.txt", "This is one too", t2)
|
||||
file3 := r.WriteObject("dupe3/three.txt", "This is another one", t3)
|
||||
|
||||
objs, dirs, err := walk.GetAll(r.Fremote, "", true, 1)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 3, len(dirs))
|
||||
assert.Equal(t, 0, len(objs))
|
||||
|
||||
err = mergeDirs(dirs)
|
||||
require.NoError(t, err)
|
||||
|
||||
file2.Path = "dupe1/two.txt"
|
||||
file3.Path = "dupe1/three.txt"
|
||||
fstest.CheckItems(t, r.Fremote, file1, file2, file3)
|
||||
|
||||
objs, dirs, err = walk.GetAll(r.Fremote, "", true, 1)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 1, len(dirs))
|
||||
assert.Equal(t, 0, len(objs))
|
||||
assert.Equal(t, "dupe1", dirs[0].Remote())
|
||||
}
|
|
@ -7,7 +7,6 @@ import (
|
|||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"path"
|
||||
"sort"
|
||||
"strconv"
|
||||
|
@ -18,7 +17,6 @@ import (
|
|||
|
||||
"github.com/ncw/rclone/fs"
|
||||
"github.com/ncw/rclone/fs/accounting"
|
||||
"github.com/ncw/rclone/fs/config"
|
||||
"github.com/ncw/rclone/fs/fserrors"
|
||||
"github.com/ncw/rclone/fs/hash"
|
||||
"github.com/ncw/rclone/fs/march"
|
||||
|
@ -26,7 +24,6 @@ import (
|
|||
"github.com/ncw/rclone/fs/walk"
|
||||
"github.com/ncw/rclone/lib/readers"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/spf13/pflag"
|
||||
)
|
||||
|
||||
// CheckHashes checks the two files to see if they have common
|
||||
|
@ -1010,276 +1007,6 @@ func Delete(f fs.Fs) error {
|
|||
return err
|
||||
}
|
||||
|
||||
// dedupeRename renames the objs slice to different names
|
||||
func dedupeRename(remote string, objs []fs.Object) {
|
||||
f := objs[0].Fs()
|
||||
doMove := f.Features().Move
|
||||
if doMove == nil {
|
||||
log.Fatalf("Fs %v doesn't support Move", f)
|
||||
}
|
||||
ext := path.Ext(remote)
|
||||
base := remote[:len(remote)-len(ext)]
|
||||
for i, o := range objs {
|
||||
newName := fmt.Sprintf("%s-%d%s", base, i+1, ext)
|
||||
if !fs.Config.DryRun {
|
||||
newObj, err := doMove(o, newName)
|
||||
if err != nil {
|
||||
fs.CountError(err)
|
||||
fs.Errorf(o, "Failed to rename: %v", err)
|
||||
continue
|
||||
}
|
||||
fs.Infof(newObj, "renamed from: %v", o)
|
||||
} else {
|
||||
fs.Logf(remote, "Not renaming to %q as --dry-run", newName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dedupeDeleteAllButOne deletes all but the one in keep
|
||||
func dedupeDeleteAllButOne(keep int, remote string, objs []fs.Object) {
|
||||
for i, o := range objs {
|
||||
if i == keep {
|
||||
continue
|
||||
}
|
||||
_ = DeleteFile(o)
|
||||
}
|
||||
fs.Logf(remote, "Deleted %d extra copies", len(objs)-1)
|
||||
}
|
||||
|
||||
// dedupeDeleteIdentical deletes all but one of identical (by hash) copies
|
||||
func dedupeDeleteIdentical(remote string, objs []fs.Object) []fs.Object {
|
||||
// See how many of these duplicates are identical
|
||||
byHash := make(map[string][]fs.Object, len(objs))
|
||||
for _, o := range objs {
|
||||
md5sum, err := o.Hash(hash.MD5)
|
||||
if err == nil {
|
||||
byHash[md5sum] = append(byHash[md5sum], o)
|
||||
}
|
||||
}
|
||||
|
||||
// Delete identical duplicates, refilling obj with the ones remaining
|
||||
objs = nil
|
||||
for md5sum, hashObjs := range byHash {
|
||||
if len(hashObjs) > 1 {
|
||||
fs.Logf(remote, "Deleting %d/%d identical duplicates (md5sum %q)", len(hashObjs)-1, len(hashObjs), md5sum)
|
||||
for _, o := range hashObjs[1:] {
|
||||
_ = DeleteFile(o)
|
||||
}
|
||||
}
|
||||
objs = append(objs, hashObjs[0])
|
||||
}
|
||||
|
||||
return objs
|
||||
}
|
||||
|
||||
// dedupeInteractive interactively dedupes the slice of objects
|
||||
func dedupeInteractive(remote string, objs []fs.Object) {
|
||||
fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
|
||||
for i, o := range objs {
|
||||
md5sum, err := o.Hash(hash.MD5)
|
||||
if err != nil {
|
||||
md5sum = err.Error()
|
||||
}
|
||||
fmt.Printf(" %d: %12d bytes, %s, md5sum %32s\n", i+1, o.Size(), o.ModTime().Local().Format("2006-01-02 15:04:05.000000000"), md5sum)
|
||||
}
|
||||
switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
|
||||
case 's':
|
||||
case 'k':
|
||||
keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
|
||||
dedupeDeleteAllButOne(keep-1, remote, objs)
|
||||
case 'r':
|
||||
dedupeRename(remote, objs)
|
||||
}
|
||||
}
|
||||
|
||||
type objectsSortedByModTime []fs.Object
|
||||
|
||||
func (objs objectsSortedByModTime) Len() int { return len(objs) }
|
||||
func (objs objectsSortedByModTime) Swap(i, j int) { objs[i], objs[j] = objs[j], objs[i] }
|
||||
func (objs objectsSortedByModTime) Less(i, j int) bool {
|
||||
return objs[i].ModTime().Before(objs[j].ModTime())
|
||||
}
|
||||
|
||||
// DeduplicateMode is how the dedupe command chooses what to do
|
||||
type DeduplicateMode int
|
||||
|
||||
// Deduplicate modes
|
||||
const (
|
||||
DeduplicateInteractive DeduplicateMode = iota // interactively ask the user
|
||||
DeduplicateSkip // skip all conflicts
|
||||
DeduplicateFirst // choose the first object
|
||||
DeduplicateNewest // choose the newest object
|
||||
DeduplicateOldest // choose the oldest object
|
||||
DeduplicateRename // rename the objects
|
||||
)
|
||||
|
||||
func (x DeduplicateMode) String() string {
|
||||
switch x {
|
||||
case DeduplicateInteractive:
|
||||
return "interactive"
|
||||
case DeduplicateSkip:
|
||||
return "skip"
|
||||
case DeduplicateFirst:
|
||||
return "first"
|
||||
case DeduplicateNewest:
|
||||
return "newest"
|
||||
case DeduplicateOldest:
|
||||
return "oldest"
|
||||
case DeduplicateRename:
|
||||
return "rename"
|
||||
}
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
// Set a DeduplicateMode from a string
|
||||
func (x *DeduplicateMode) Set(s string) error {
|
||||
switch strings.ToLower(s) {
|
||||
case "interactive":
|
||||
*x = DeduplicateInteractive
|
||||
case "skip":
|
||||
*x = DeduplicateSkip
|
||||
case "first":
|
||||
*x = DeduplicateFirst
|
||||
case "newest":
|
||||
*x = DeduplicateNewest
|
||||
case "oldest":
|
||||
*x = DeduplicateOldest
|
||||
case "rename":
|
||||
*x = DeduplicateRename
|
||||
default:
|
||||
return errors.Errorf("Unknown mode for dedupe %q.", s)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Type of the value
|
||||
func (x *DeduplicateMode) Type() string {
|
||||
return "string"
|
||||
}
|
||||
|
||||
// Check it satisfies the interface
|
||||
var _ pflag.Value = (*DeduplicateMode)(nil)
|
||||
|
||||
// dedupeFindDuplicateDirs scans f for duplicate directories
|
||||
func dedupeFindDuplicateDirs(f fs.Fs) ([][]fs.Directory, error) {
|
||||
duplicateDirs := [][]fs.Directory{}
|
||||
err := walk.Walk(f, "", true, fs.Config.MaxDepth, func(dirPath string, entries fs.DirEntries, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
dirs := map[string][]fs.Directory{}
|
||||
entries.ForDir(func(d fs.Directory) {
|
||||
dirs[d.Remote()] = append(dirs[d.Remote()], d)
|
||||
})
|
||||
for _, ds := range dirs {
|
||||
if len(ds) > 1 {
|
||||
duplicateDirs = append(duplicateDirs, ds)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "find duplicate dirs")
|
||||
}
|
||||
return duplicateDirs, nil
|
||||
}
|
||||
|
||||
// dedupeMergeDuplicateDirs merges all the duplicate directories found
|
||||
func dedupeMergeDuplicateDirs(f fs.Fs, duplicateDirs [][]fs.Directory) error {
|
||||
mergeDirs := f.Features().MergeDirs
|
||||
if mergeDirs == nil {
|
||||
return errors.Errorf("%v: can't merge directories", f)
|
||||
}
|
||||
dirCacheFlush := f.Features().DirCacheFlush
|
||||
if dirCacheFlush == nil {
|
||||
return errors.Errorf("%v: can't flush dir cache", f)
|
||||
}
|
||||
for _, dirs := range duplicateDirs {
|
||||
if !fs.Config.DryRun {
|
||||
fs.Infof(dirs[0], "Merging contents of duplicate directories")
|
||||
err := mergeDirs(dirs)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "merge duplicate dirs")
|
||||
}
|
||||
} else {
|
||||
fs.Infof(dirs[0], "NOT Merging contents of duplicate directories as --dry-run")
|
||||
}
|
||||
}
|
||||
dirCacheFlush()
|
||||
return nil
|
||||
}
|
||||
|
||||
// Deduplicate interactively finds duplicate files and offers to
|
||||
// delete all but one or rename them to be different. Only useful with
|
||||
// Google Drive which can have duplicate file names.
|
||||
func Deduplicate(f fs.Fs, mode DeduplicateMode) error {
|
||||
fs.Infof(f, "Looking for duplicates using %v mode.", mode)
|
||||
|
||||
// Find duplicate directories first and fix them - repeat
|
||||
// until all fixed
|
||||
for {
|
||||
duplicateDirs, err := dedupeFindDuplicateDirs(f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(duplicateDirs) == 0 {
|
||||
break
|
||||
}
|
||||
err = dedupeMergeDuplicateDirs(f, duplicateDirs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if fs.Config.DryRun {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Now find duplicate files
|
||||
files := map[string][]fs.Object{}
|
||||
err := walk.Walk(f, "", true, fs.Config.MaxDepth, func(dirPath string, entries fs.DirEntries, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
entries.ForObject(func(o fs.Object) {
|
||||
remote := o.Remote()
|
||||
files[remote] = append(files[remote], o)
|
||||
})
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for remote, objs := range files {
|
||||
if len(objs) > 1 {
|
||||
fs.Logf(remote, "Found %d duplicates - deleting identical copies", len(objs))
|
||||
objs = dedupeDeleteIdentical(remote, objs)
|
||||
if len(objs) <= 1 {
|
||||
fs.Logf(remote, "All duplicates removed")
|
||||
continue
|
||||
}
|
||||
switch mode {
|
||||
case DeduplicateInteractive:
|
||||
dedupeInteractive(remote, objs)
|
||||
case DeduplicateFirst:
|
||||
dedupeDeleteAllButOne(0, remote, objs)
|
||||
case DeduplicateNewest:
|
||||
sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
|
||||
dedupeDeleteAllButOne(len(objs)-1, remote, objs)
|
||||
case DeduplicateOldest:
|
||||
sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
|
||||
dedupeDeleteAllButOne(0, remote, objs)
|
||||
case DeduplicateRename:
|
||||
dedupeRename(remote, objs)
|
||||
case DeduplicateSkip:
|
||||
// skip
|
||||
default:
|
||||
//skip
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// listToChan will transfer all objects in the listing to the output
|
||||
//
|
||||
// If an error occurs, the error will be logged, and it will close the
|
||||
|
|
|
@ -37,7 +37,6 @@ import (
|
|||
"github.com/ncw/rclone/fs/hash"
|
||||
"github.com/ncw/rclone/fs/list"
|
||||
"github.com/ncw/rclone/fs/operations"
|
||||
"github.com/ncw/rclone/fs/walk"
|
||||
"github.com/ncw/rclone/fstest"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
@ -301,170 +300,6 @@ func TestCheckSizeOnly(t *testing.T) {
|
|||
TestCheck(t)
|
||||
}
|
||||
|
||||
func skipIfCantDedupe(t *testing.T, f fs.Fs) {
|
||||
if f.Features().PutUnchecked == nil {
|
||||
t.Skip("Can't test deduplicate - no PutUnchecked")
|
||||
}
|
||||
if !f.Features().DuplicateFiles {
|
||||
t.Skip("Can't test deduplicate - no duplicate files possible")
|
||||
}
|
||||
if !f.Hashes().Contains(hash.MD5) {
|
||||
t.Skip("Can't test deduplicate - MD5 not supported")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeduplicateInteractive(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file3 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateInteractive)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file1)
|
||||
}
|
||||
|
||||
func TestDeduplicateSkip(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file3 := r.WriteUncheckedObject("one", "This is another one", t1)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateSkip)
|
||||
require.NoError(t, err)
|
||||
|
||||
r.CheckWithDuplicates(t, file1, file3)
|
||||
}
|
||||
|
||||
func TestDeduplicateFirst(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject("one", "This is one A", t1)
|
||||
file3 := r.WriteUncheckedObject("one", "This is one BB", t1)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateFirst)
|
||||
require.NoError(t, err)
|
||||
|
||||
objects, size, err := operations.Count(r.Fremote)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, int64(1), objects)
|
||||
if size != file1.Size && size != file2.Size && size != file3.Size {
|
||||
t.Errorf("Size not one of the object sizes %d", size)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeduplicateNewest(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject("one", "This is one too", t2)
|
||||
file3 := r.WriteUncheckedObject("one", "This is another one", t3)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateNewest)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file3)
|
||||
}
|
||||
|
||||
func TestDeduplicateOldest(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject("one", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject("one", "This is one too", t2)
|
||||
file3 := r.WriteUncheckedObject("one", "This is another one", t3)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateOldest)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file1)
|
||||
}
|
||||
|
||||
func TestDeduplicateRename(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject("one.txt", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject("one.txt", "This is one too", t2)
|
||||
file3 := r.WriteUncheckedObject("one.txt", "This is another one", t3)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(r.Fremote, operations.DeduplicateRename)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.NoError(t, walk.Walk(r.Fremote, "", true, -1, func(dirPath string, entries fs.DirEntries, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
entries.ForObject(func(o fs.Object) {
|
||||
remote := o.Remote()
|
||||
if remote != "one-1.txt" &&
|
||||
remote != "one-2.txt" &&
|
||||
remote != "one-3.txt" {
|
||||
t.Errorf("Bad file name after rename %q", remote)
|
||||
}
|
||||
size := o.Size()
|
||||
if size != file1.Size && size != file2.Size && size != file3.Size {
|
||||
t.Errorf("Size not one of the object sizes %d", size)
|
||||
}
|
||||
})
|
||||
return nil
|
||||
}))
|
||||
}
|
||||
|
||||
// This should really be a unit test, but the test framework there
|
||||
// doesn't have enough tools to make it easy
|
||||
func TestMergeDirs(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
|
||||
mergeDirs := r.Fremote.Features().MergeDirs
|
||||
if mergeDirs == nil {
|
||||
t.Skip("Can't merge directories")
|
||||
}
|
||||
|
||||
file1 := r.WriteObject("dupe1/one.txt", "This is one", t1)
|
||||
file2 := r.WriteObject("dupe2/two.txt", "This is one too", t2)
|
||||
file3 := r.WriteObject("dupe3/three.txt", "This is another one", t3)
|
||||
|
||||
objs, dirs, err := walk.GetAll(r.Fremote, "", true, 1)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 3, len(dirs))
|
||||
assert.Equal(t, 0, len(objs))
|
||||
|
||||
err = mergeDirs(dirs)
|
||||
require.NoError(t, err)
|
||||
|
||||
file2.Path = "dupe1/two.txt"
|
||||
file3.Path = "dupe1/three.txt"
|
||||
fstest.CheckItems(t, r.Fremote, file1, file2, file3)
|
||||
|
||||
objs, dirs, err = walk.GetAll(r.Fremote, "", true, 1)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 1, len(dirs))
|
||||
assert.Equal(t, 0, len(objs))
|
||||
assert.Equal(t, "dupe1", dirs[0].Remote())
|
||||
}
|
||||
|
||||
func TestCat(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
|
|
Loading…
Reference in a new issue