rclone/fs/operations/dedupe.go
Nick Craig-Wood 019a486d5b accounting: Make checkers show what they are doing
Before this change, all types of checkers showed "checking" after the
file name despite the fact that not all of them were checking.

After this change, they can show

- checking
- deleting
- hashing
- importing
- listing
- merging
- moving
- renaming

See: https://forum.rclone.org/t/what-is-rclone-checking-during-a-purge/35931/
2023-03-01 11:10:38 +00:00

507 lines
13 KiB
Go

// dedupe - gets rid of identical files on remotes which can have duplicate file names (drive, mega)
package operations
import (
"context"
"fmt"
"log"
"path"
"sort"
"strings"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/accounting"
"github.com/rclone/rclone/fs/config"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/fs/walk"
)
// dedupeRename renames every object in objs to a distinct name derived
// from remote.
//
// remote is split into base and extension, and the object at index i is
// offered the name "<base>-<i+suffix><ext>". suffix starts at 1 and is
// incremented while f.NewObject reports that the candidate name is
// already taken; at most 100 suffixes are tried per object.
//
// Errors are logged against the object concerned (lookup and rename
// errors are also passed through fs.CountError) and processing carries
// on with the next object. The rename itself is not performed when
// SkipDestructive returns true.
//
// It calls log.Fatalf if f has no Move feature.
func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) {
	doMove := f.Features().Move
	if doMove == nil {
		log.Fatalf("Fs %v doesn't support Move", f)
	}
	ext := path.Ext(remote)
	base := remote[:len(remote)-len(ext)]

outer:
	for i, o := range objs {
		// Probe candidate names until one is free. Each candidate is
		// looked up exactly once.
		var newName string
		for suffix := 1; ; suffix++ {
			if suffix > 100 {
				fs.Errorf(o, "Could not find an available new name")
				continue outer
			}
			newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
			_, err := f.NewObject(ctx, newName)
			if err == fs.ErrorObjectNotFound {
				// Nothing has this name so it is free to use
				break
			}
			if err != nil {
				err = fs.CountError(err)
				fs.Errorf(o, "Failed to check for existing object: %v", err)
				continue outer
			}
			// err == nil means the name is taken - try the next suffix
		}

		if SkipDestructive(ctx, o, "rename") {
			continue
		}
		newObj, err := doMove(ctx, o, newName)
		if err != nil {
			err = fs.CountError(err)
			fs.Errorf(o, "Failed to rename: %v", err)
			continue
		}
		fs.Infof(newObj, "renamed from: %v", o)
	}
}
// dedupeDeleteAllButOne calls DeleteFile on every object in objs apart
// from the one at index keep.
//
// Deletion errors do not stop the loop; only successful deletions are
// counted. If at least one object was deleted the total is logged
// against remote.
func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) {
	deleted := 0
	for idx := range objs {
		if idx != keep && DeleteFile(ctx, objs[idx]) == nil {
			deleted++
		}
	}
	if deleted == 0 {
		return
	}
	fs.Logf(remote, "Deleted %d extra copies", deleted)
}
// dedupeDeleteIdentical deletes all but one of each set of identical
// copies in objs and returns the objects which still need dealing with.
//
// Objects are first screened by backend ID (fs.IDer): if the same
// non-empty ID appears more than once in objs the object is ignored
// completely - it is neither deleted nor returned - because deleting
// one listing of it would delete the data. Objects which have no ID
// cannot be screened this way and are kept as candidates.
//
// The remaining candidates are grouped by size (when ci.SizeOnly is set
// and the size is non-negative) or otherwise by their ht hash. For each
// group the first object is kept and DeleteFile is called on the rest.
//
// remote is used only for logging.
//
// The returned slice holds the candidates which could not be grouped
// (no usable size or hash), followed, for each group in the order the
// groups were first seen in objs, by the kept object and any copies
// which failed to delete.
//
// Note that objs is filtered in place so its backing array is modified.
func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
	ci := fs.GetConfig(ctx)

	// objectID returns the backend ID of o or "" if it doesn't have one
	objectID := func(o fs.Object) string {
		if do, ok := o.(fs.IDer); ok {
			return do.ID()
		}
		return ""
	}

	// Count how many times each ID appears in the listing
	idCounts := make(map[string]int, len(objs))
	for _, o := range objs {
		if id := objectID(o); id != "" {
			idCounts[id]++
		}
	}

	// Remove objects with duplicated IDs, keeping those without an ID
	candidates := objs[:0]
	for _, o := range objs {
		if id := objectID(o); id != "" && idCounts[id] > 1 {
			fs.Logf(o, "Ignoring as it appears %d times in the listing and deleting would lead to data loss", idCounts[id])
			continue
		}
		candidates = append(candidates, o)
	}

	// See how many of these duplicates are identical.
	//
	// keys records the group keys in first-seen order so that the
	// result does not depend on map iteration order.
	dupesByKey := make(map[string][]fs.Object, len(candidates))
	var keys []string
	for _, o := range candidates {
		key := ""
		if ci.SizeOnly && o.Size() >= 0 {
			key = fmt.Sprintf("size %d", o.Size())
		} else if ht != hash.None {
			hashValue, err := o.Hash(ctx, ht)
			if err == nil && hashValue != "" {
				key = fmt.Sprintf("%v %s", ht, hashValue)
			}
		}
		if key == "" {
			// Can't tell whether this is identical to anything
			remainingObjs = append(remainingObjs, o)
			continue
		}
		if _, seen := dupesByKey[key]; !seen {
			keys = append(keys, key)
		}
		dupesByKey[key] = append(dupesByKey[key], o)
	}

	// Delete identical duplicates, filling remainingObjs with the ones remaining
	for _, key := range keys {
		dupes := dupesByKey[key]
		remainingObjs = append(remainingObjs, dupes[0])
		if len(dupes) <= 1 {
			continue
		}
		fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), key)
		for _, o := range dupes[1:] {
			if err := DeleteFile(ctx, o); err != nil {
				// Still here, so the caller needs to know about it
				remainingObjs = append(remainingObjs, o)
			}
		}
	}

	return remainingObjs
}
// dedupeList prints the duplicates in objs to standard output and does
// nothing else.
//
// A header line with remote and the number of objects is printed first,
// then one numbered line (starting at 1) per object showing its size and
// local modification time. When byHash is set the line ends with the
// object's remote path; otherwise it ends with the hash type ht and the
// object's hash (or the error text if hashing failed, or nothing if ht
// is hash.None).
//
// The hash is only computed when it is going to be printed. f is not
// used.
func dedupeList(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) {
	const timeFormat = "2006-01-02 15:04:05.000000000"

	fmt.Printf("%s: %d duplicates\n", remote, len(objs))
	for i, o := range objs {
		modTime := o.ModTime(ctx).Local().Format(timeFormat)
		if byHash {
			// The objects were grouped by hash so show the names instead
			fmt.Printf(" %d: %12d bytes, %s, %s\n", i+1, o.Size(), modTime, o.Remote())
			continue
		}
		hashValue := ""
		if ht != hash.None {
			var err error
			hashValue, err = o.Hash(ctx, ht)
			if err != nil {
				hashValue = err.Error()
			}
		}
		fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), modTime, ht, hashValue)
	}
}
// dedupeInteractive lists the duplicates in objs, then asks the user
// what to do with them via config.Command.
//
// The choices are skip, keep one (the user is asked which by number) and
// quit; rename is offered as well unless byHash is set.
//
// It returns false if the user chose to quit and true otherwise.
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) bool {
	dedupeList(ctx, f, ht, remote, objs, byHash)

	// Build the menu - each entry begins with the key handled below
	menu := make([]string, 0, 4)
	menu = append(menu,
		"sSkip and do nothing",
		"kKeep just one (choose which in next step)",
	)
	if !byHash {
		menu = append(menu, "rRename all to be different (by changing file.jpg to file-1.jpg)")
	}
	menu = append(menu, "qQuit")

	answer := config.Command(menu)
	if answer == 'q' {
		return false
	}
	if answer == 'k' {
		// The user picks from 1 but the slice is indexed from 0
		chosen := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
		dedupeDeleteAllButOne(ctx, chosen-1, remote, objs)
	} else if answer == 'r' {
		dedupeRename(ctx, f, remote, objs)
	}
	// 's' (or anything else) leaves the objects alone
	return true
}
// DeduplicateMode selects what the dedupe command does with each set of
// duplicates it finds.
type DeduplicateMode int

// The available DeduplicateMode values.
const (
	DeduplicateInteractive DeduplicateMode = iota // ask the user what to do
	DeduplicateSkip                               // leave every conflict alone
	DeduplicateFirst                              // keep the first object
	DeduplicateNewest                             // keep the newest object
	DeduplicateOldest                             // keep the oldest object
	DeduplicateRename                             // rename the objects
	DeduplicateLargest                            // keep the largest object
	DeduplicateSmallest                           // keep the smallest object
	DeduplicateList                               // only list the duplicates
)

// String returns the lower case name of the mode, or "unknown" if x is
// not one of the declared modes.
func (x DeduplicateMode) String() string {
	names := [...]string{
		DeduplicateInteractive: "interactive",
		DeduplicateSkip:        "skip",
		DeduplicateFirst:       "first",
		DeduplicateNewest:      "newest",
		DeduplicateOldest:      "oldest",
		DeduplicateRename:      "rename",
		DeduplicateLargest:     "largest",
		DeduplicateSmallest:    "smallest",
		DeduplicateList:        "list",
	}
	if x < 0 || int(x) >= len(names) {
		return "unknown"
	}
	return names[x]
}

// Set parses s, ignoring case, as the name of a mode and stores it in x.
//
// If s does not name a mode an error is returned and x is left as it
// was.
func (x *DeduplicateMode) Set(s string) error {
	wanted := strings.ToLower(s)
	for mode := DeduplicateInteractive; mode <= DeduplicateList; mode++ {
		if mode.String() == wanted {
			*x = mode
			return nil
		}
	}
	return fmt.Errorf("unknown mode for dedupe %q", s)
}

// Type returns the name of the value's type, which is always "string".
func (x *DeduplicateMode) Type() string {
	return "string"
}
// dedupeDir is a directory together with a count of entries and a link
// to its parent.
type dedupeDir struct {
	dir    fs.Directory // the directory itself
	parent string       // key of the parent in the dedupeDirsMap, "" if none
	count  int          // bumped by increment for this directory and all its ancestors
}

// dedupeDirsMap holds directories keyed by ID along with their
// recursive counts.
type dedupeDirsMap map[string]*dedupeDir

// get returns the entry stored under id, adding an empty one first if
// there isn't one yet.
func (dm dedupeDirsMap) get(id string) *dedupeDir {
	if existing := dm[id]; existing != nil {
		return existing
	}
	created := new(dedupeDir)
	dm[id] = created
	return created
}

// increment adds one to the count of parent and then, following the
// parent links, to the count of each of its ancestors. It stops when it
// reaches an empty parent.
func (dm dedupeDirsMap) increment(parent string) {
	if parent == "" {
		return
	}
	node := dm.get(parent)
	node.count++
	dm.increment(node.parent)
}
// dedupeFindDuplicateDirs lists f (to a depth of ci.MaxDepth) looking
// for directories which share the same remote path.
//
// Every entry listed bumps the counts of its parent directory and that
// directory's ancestors, so each dedupeDir returned carries a recursive
// entry count.
//
// It returns one slice per duplicated path, ordered by path, each
// holding all of the directories found with that path. A listing error
// is returned wrapped.
func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) (duplicateDirs [][]*dedupeDir, err error) {
	ci := fs.GetConfig(ctx)
	byID := dedupeDirsMap{}
	byRemote := make(map[string][]*dedupeDir)

	err = walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListAll, func(entries fs.DirEntries) error {
		for _, entry := range entries {
			tr := accounting.Stats(ctx).NewCheckingTransfer(entry, "merging")
			remote := entry.Remote()

			// Use the ID of the parent if the entry can supply one
			// (this usually means the backend allows duplicate
			// paths), otherwise fall back to the parent's path with
			// the root being "".
			parent := ""
			if withParent, ok := entry.(fs.ParentIDer); ok {
				parent = withParent.ParentID()
			}
			if parent == "" {
				if parent = path.Dir(remote); parent == "." {
					parent = ""
				}
			}

			// Likewise identify the entry by ID, or failing that by path
			id := ""
			if withID, ok := entry.(fs.IDer); ok {
				id = withID.ID()
			}
			if id == "" {
				id = remote
			}

			if dir, isDir := entry.(fs.Directory); isDir {
				node := byID.get(id)
				node.dir, node.parent = dir, parent
				byRemote[remote] = append(byRemote[remote], node)
			}

			byID.increment(parent)
			tr.Done(ctx, nil)
		}
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("find duplicate dirs: %w", err)
	}

	// Pick out the paths seen more than once and sort them so that
	// parents come before their children
	var names []string
	for name, group := range byRemote {
		if len(group) > 1 {
			names = append(names, name)
		}
	}
	sort.Strings(names)
	for _, name := range names {
		duplicateDirs = append(duplicateDirs, byRemote[name])
	}
	return duplicateDirs, nil
}
// dedupeMergeDuplicateDirs merges each group of duplicate directories
// in duplicateDirs using the MergeDirs feature of f, then flushes the
// directory cache of f.
//
// Groups for which SkipDestructive returns true are left alone. A
// failure to merge a group is counted and logged but does not stop the
// remaining groups from being merged, nor is it returned.
//
// An error is returned only if f lacks the MergeDirs or DirCacheFlush
// feature, in which case nothing is merged.
func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]*dedupeDir) error {
	merge := f.Features().MergeDirs
	if merge == nil {
		return fmt.Errorf("%v: can't merge directories", f)
	}
	flush := f.Features().DirCacheFlush
	if flush == nil {
		return fmt.Errorf("%v: can't flush dir cache", f)
	}

	for _, group := range duplicateDirs {
		if SkipDestructive(ctx, group[0].dir, "merge duplicate directories") {
			continue
		}

		// Gather the directories, noting the first one with the
		// highest count
		toMerge := make([]fs.Directory, len(group))
		biggest, biggestCount := 0, -1
		for i := range group {
			toMerge[i] = group[i].dir
			if n := group[i].count; n > biggestCount {
				biggest, biggestCount = i, n
			}
		}
		// Put the largest directory in front to minimize movements
		toMerge[0], toMerge[biggest] = toMerge[biggest], toMerge[0]

		fs.Infof(toMerge[0], "Merging contents of duplicate directories")
		if err := merge(ctx, toMerge); err != nil {
			fs.Errorf(nil, "merge duplicate dirs: %v", fs.CountError(err))
		}
	}

	flush()
	return nil
}
// sortOldestFirst sorts objs in place so that earlier modification
// times come first.
func sortOldestFirst(objs []fs.Object) {
	older := func(i, j int) bool {
		// No context is passed in so use a placeholder
		ti := objs[i].ModTime(context.TODO())
		tj := objs[j].ModTime(context.TODO())
		return ti.Before(tj)
	}
	sort.Slice(objs, older)
}
// sortSmallestFirst sorts objs in place so that smaller sizes come
// first.
func sortSmallestFirst(objs []fs.Object) {
	smaller := func(i, j int) bool {
		a, b := objs[i], objs[j]
		return a.Size() < b.Size()
	}
	sort.Slice(objs, smaller)
}
// Deduplicate finds files in f which share a name - or a hash if byHash
// is set - and deals with each set of duplicates according to mode.
// Only useful with backends such as Google Drive which can have
// duplicate file names.
//
// Unless byHash is set, duplicate directories are dealt with first:
// they are merged, or only printed if mode is DeduplicateList. Then for
// each set of duplicate files the identical copies are removed with
// dedupeDeleteIdentical (not done for byHash or DeduplicateList) and
// mode decides what happens to whatever remains.
//
// It returns an error if byHash is set but f has no hashes, or if
// listing or merging the directories fails. Quitting from interactive
// mode stops processing and returns nil.
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error {
	ci := fs.GetConfig(ctx)

	// Choose the hash to work with and describe what is being compared
	ht := f.Hashes().GetOne()
	what := "names"
	if byHash {
		if ht == hash.None {
			return fmt.Errorf("%v has no hashes", f)
		}
		what = ht.String() + " hashes"
	}
	fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode)

	// Duplicate directories are found and fixed before the files
	if !byHash {
		dupDirs, err := dedupeFindDuplicateDirs(ctx, f)
		if err != nil {
			return err
		}
		switch {
		case len(dupDirs) == 0:
			// nothing to do
		case mode == DeduplicateList:
			for _, group := range dupDirs {
				fmt.Printf("%s: %d duplicates of this directory\n", group[0].dir.Remote(), len(group))
			}
		default:
			if err := dedupeMergeDuplicateDirs(ctx, f, dupDirs); err != nil {
				return err
			}
		}
	}

	// Group the files by name, or by hash if byHash is set
	groups := make(map[string][]fs.Object)
	collect := func(o fs.Object) {
		tr := accounting.Stats(ctx).NewCheckingTransfer(o, "checking")
		defer tr.Done(ctx, nil)

		var key string
		if !byHash {
			key = o.Remote()
		} else if hashValue, hashErr := o.Hash(ctx, ht); hashErr != nil {
			// Leave key empty so the object is left out
			fs.Errorf(o, "Failed to hash: %v", hashErr)
		} else {
			key = hashValue
		}
		if key == "" {
			return
		}
		groups[key] = append(groups[key], o)
	}
	err := walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
		entries.ForObject(collect)
		return nil
	})
	if err != nil {
		return err
	}

	// Deal with every group holding more than one file
	for key, objs := range groups {
		if len(objs) < 2 {
			continue
		}
		fs.Logf(key, "Found %d files with duplicate %s", len(objs), what)
		if mode != DeduplicateList && !byHash {
			objs = dedupeDeleteIdentical(ctx, ht, key, objs)
			if len(objs) < 2 {
				fs.Logf(key, "All duplicates removed")
				continue
			}
		}

		last := len(objs) - 1
		switch mode {
		case DeduplicateInteractive:
			if keepGoing := dedupeInteractive(ctx, f, ht, key, objs, byHash); !keepGoing {
				return nil
			}
		case DeduplicateSkip:
			fs.Logf(key, "Skipping %d files with duplicate %s", len(objs), what)
		case DeduplicateList:
			dedupeList(ctx, f, ht, key, objs, byHash)
		case DeduplicateRename:
			dedupeRename(ctx, f, key, objs)
		case DeduplicateFirst:
			dedupeDeleteAllButOne(ctx, 0, key, objs)
		case DeduplicateOldest:
			sortOldestFirst(objs)
			dedupeDeleteAllButOne(ctx, 0, key, objs)
		case DeduplicateNewest:
			sortOldestFirst(objs)
			dedupeDeleteAllButOne(ctx, last, key, objs)
		case DeduplicateSmallest:
			sortSmallestFirst(objs)
			dedupeDeleteAllButOne(ctx, 0, key, objs)
		case DeduplicateLargest:
			sortSmallestFirst(objs)
			dedupeDeleteAllButOne(ctx, last, key, objs)
		}
		// any other mode leaves the group alone
	}
	return nil
}