dedupe: implement keep smallest too

This helps with deduplicating Google Docs and their exported versions
if they accidentally get uploaded to the source again.

See: https://forum.rclone.org/t/my-stupidity-or-a-bug/13861
This commit is contained in:
Nick Craig-Wood 2020-01-16 13:47:15 +00:00
parent 1bd9f522e0
commit 81002747c5
3 changed files with 43 additions and 20 deletions

View file

@ -94,6 +94,7 @@ Dedupe can be run non interactively using the ` + "`" + `--dedupe-mode` + "`" +
* ` + "`" + `--dedupe-mode newest` + "`" + ` - removes identical files then keeps the newest one.
* ` + "`" + `--dedupe-mode oldest` + "`" + ` - removes identical files then keeps the oldest one.
* ` + "`" + `--dedupe-mode largest` + "`" + ` - removes identical files then keeps the largest one.
* ` + "`" + `--dedupe-mode smallest` + "`" + ` - removes identical files then keeps the smallest one.
* ` + "`" + `--dedupe-mode rename` + "`" + ` - removes identical files then renames the rest to be different.
For example to rename all the identically named photos in your Google Photos directory, do

View file

@ -125,14 +125,6 @@ func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string
}
}
// objectsSortedByModTime implements sort.Interface over a slice of
// objects, ordering them by modification time with the oldest first.
type objectsSortedByModTime []fs.Object

// Len reports how many objects are in the slice.
func (s objectsSortedByModTime) Len() int {
	return len(s)
}

// Swap exchanges the objects at positions i and j.
func (s objectsSortedByModTime) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

// Less reports whether the object at i was modified before the one at j.
func (s objectsSortedByModTime) Less(i, j int) bool {
	// No caller context is available through sort.Interface, hence context.TODO().
	left := s[i].ModTime(context.TODO())
	right := s[j].ModTime(context.TODO())
	return left.Before(right)
}
// DeduplicateMode is how the dedupe command chooses what to do
// with a set of duplicates. Its String and Set methods convert it to
// and from the textual mode names such as "largest" or "smallest".
type DeduplicateMode int
@ -145,6 +137,7 @@ const (
DeduplicateOldest // choose the oldest object
DeduplicateRename // rename the objects
DeduplicateLargest // choose the largest object
DeduplicateSmallest // choose the smallest object
)
func (x DeduplicateMode) String() string {
@ -163,6 +156,8 @@ func (x DeduplicateMode) String() string {
return "rename"
case DeduplicateLargest:
return "largest"
case DeduplicateSmallest:
return "smallest"
}
return "unknown"
}
@ -184,6 +179,8 @@ func (x *DeduplicateMode) Set(s string) error {
*x = DeduplicateRename
case "largest":
*x = DeduplicateLargest
case "smallest":
*x = DeduplicateSmallest
default:
return errors.Errorf("Unknown mode for dedupe %q.", s)
}
@ -248,6 +245,20 @@ func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs
return nil
}
// sortOldestFirst sorts objs in place by modification time so that
// the oldest object comes first and the newest comes last.
func sortOldestFirst(objs []fs.Object) {
	// No caller context reaches the comparator, hence context.TODO().
	modifiedEarlier := func(a, b int) bool {
		ta := objs[a].ModTime(context.TODO())
		tb := objs[b].ModTime(context.TODO())
		return ta.Before(tb)
	}
	sort.Slice(objs, modifiedEarlier)
}
// sortSmallestFirst sorts objs in place by size so that the smallest
// object comes first and the largest comes last.
func sortSmallestFirst(objs []fs.Object) {
	smaller := func(a, b int) bool {
		sizeA := objs[a].Size()
		sizeB := objs[b].Size()
		return sizeA < sizeB
	}
	sort.Slice(objs, smaller)
}
// Deduplicate interactively finds duplicate files and offers to
// delete all but one or rename them to be different. Only useful with
// Google Drive which can have duplicate file names.
@ -296,24 +307,19 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
case DeduplicateFirst:
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateNewest:
sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
sortOldestFirst(objs)
dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
case DeduplicateOldest:
sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
sortOldestFirst(objs)
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateRename:
dedupeRename(ctx, f, remote, objs)
case DeduplicateLargest:
largest, largestIndex := int64(-1), -1
for i, obj := range objs {
size := obj.Size()
if size > largest {
largest, largestIndex = size, i
}
}
if largestIndex > -1 {
dedupeDeleteAllButOne(ctx, largestIndex, remote, objs)
}
sortSmallestFirst(objs)
dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
case DeduplicateSmallest:
sortSmallestFirst(objs)
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateSkip:
// skip
default:

View file

@ -152,6 +152,22 @@ func TestDeduplicateLargest(t *testing.T) {
fstest.CheckItems(t, r.Fremote, file3)
}
// TestDeduplicateSmallest writes three objects of increasing size under
// the same name and checks that only the smallest one remains after
// deduplicating in DeduplicateSmallest mode.
func TestDeduplicateSmallest(t *testing.T) {
	r := fstest.NewRun(t)
	defer r.Finalise()
	skipIfCantDedupe(t, r.Fremote)

	ctx := context.Background()

	// Three same-named objects, ordered here from shortest to longest content.
	smallest := r.WriteUncheckedObject(ctx, "one", "This is one", t1)
	middle := r.WriteUncheckedObject(ctx, "one", "This is one too", t2)
	biggest := r.WriteUncheckedObject(ctx, "one", "This is another one", t3)
	r.CheckWithDuplicates(t, smallest, middle, biggest)

	err := operations.Deduplicate(ctx, r.Fremote, operations.DeduplicateSmallest)
	require.NoError(t, err)

	// Only the shortest object should remain.
	fstest.CheckItems(t, r.Fremote, smallest)
}
func TestDeduplicateRename(t *testing.T) {
r := fstest.NewRun(t)
defer r.Finalise()