Merge pull request #3290 from aawsome/prune-handle-duplicates

prune: Handle duplicate blobs more efficiently
2022-07-17 11:51:54 +02:00 · 2022-07-17 11:51:54 +02:00 · d9ea1e9ee2
commit d9ea1e9ee2
parent 6cbeb4a9f9 715d457aad
2 changed files with 94 additions and 44 deletions
--- a/changelog/unreleased/issue-3114
+++ b/changelog/unreleased/issue-3114
@ -0,0 +1,12 @@
+Enhancement: Optimize handling of duplicate blobs in `prune`
+
+Restic `prune` always used to repack all data files containing duplicate
+blobs. This effectively removed all duplicates during prune. However, as a
+consequence all these data files were repacked even if the unused repository
+space threshold could be reached with less work.
+
+This is now changed and `prune` works nice and fast also if there are lots
+of duplicates.
+
+https://github.com/restic/restic/issues/3114
+https://github.com/restic/restic/pull/3290
--- a/cmd/restic/cmd_prune.go
+++ b/cmd/restic/cmd_prune.go
@ -195,13 +195,12 @@ func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.R
 }

 type packInfo struct {
-	usedBlobs      uint
-	unusedBlobs    uint
-	duplicateBlobs uint
-	usedSize       uint64
-	unusedSize     uint64
-	tpe            restic.BlobType
-	uncompressed   bool
+	usedBlobs    uint
+	unusedBlobs  uint
+	usedSize     uint64
+	unusedSize   uint64
+	tpe          restic.BlobType
+	uncompressed bool
 }

 type packInfoWithID struct {
@ -243,7 +242,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 	Verbosef("searching used packs...\n")

 	keepBlobs := restic.NewBlobSet()
-	duplicateBlobs := restic.NewBlobSet()
+	duplicateBlobs := make(map[restic.BlobHandle]uint8)

 	// iterate over all blobs in index to find out which blobs are duplicates
 	for blob := range repo.Index().Each(ctx) {
@ -256,7 +255,16 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 			stats.size.used += size
 			stats.blobs.used++
 		case keepBlobs.Has(bh): // duplicate blob
-			duplicateBlobs.Insert(bh)
+			count, ok := duplicateBlobs[bh]
+			if !ok {
+				count = 2 // this one is already the second blob!
+			} else if count < math.MaxUint8 {
+				// don't overflow, but saturate count at 255
+				// this can lead to a non-optimal pack selection, but won't cause
+				// problems otherwise
+				count++
+			}
+			duplicateBlobs[bh] = count
 			stats.size.duplicate += size
 			stats.blobs.duplicate++
 		default:
@ -299,10 +307,9 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

 		bh := blob.BlobHandle
 		size := uint64(blob.Length)
+		_, isDuplicate := duplicateBlobs[bh]
 		switch {
-		case duplicateBlobs.Has(bh): // duplicate blob
-			ip.usedSize += size
-			ip.duplicateBlobs++
+		case isDuplicate: // duplicate blobs will be handled later
 		case keepBlobs.Has(bh): // used blob, not duplicate
 			ip.usedSize += size
 			ip.usedBlobs++
@ -317,21 +324,49 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 		indexPack[blob.PackID] = ip
 	}

+	// if duplicate blobs exist, those will be set to either "used" or "unused":
+	// - mark only one occurence of duplicate blobs as used
+	// - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used"
+	// - if there are no used blobs in a pack, possibly mark duplicates as "unused"
+	if len(duplicateBlobs) > 0 {
+		// iterate again over all blobs in index (this is pretty cheap, all in-mem)
+		for blob := range repo.Index().Each(ctx) {
+			bh := blob.BlobHandle
+			count, isDuplicate := duplicateBlobs[bh]
+			if !isDuplicate {
+				continue
+			}
+
+			ip := indexPack[blob.PackID]
+			size := uint64(blob.Length)
+			switch {
+			case count == 0:
+				// used duplicate exists ->  mark as unused
+				ip.unusedSize += size
+				ip.unusedBlobs++
+			case ip.usedBlobs > 0, count == 1:
+				// other used blobs in pack or "last" occurency ->  mark as used
+				ip.usedSize += size
+				ip.usedBlobs++
+				// let other occurences be marked as unused
+				duplicateBlobs[bh] = 0
+			default:
+				// mark as unused and decrease counter
+				ip.unusedSize += size
+				ip.unusedBlobs++
+				duplicateBlobs[bh] = count - 1
+			}
+			// update indexPack
+			indexPack[blob.PackID] = ip
+		}
+	}
+
 	Verbosef("collecting packs for deletion and repacking\n")
 	removePacksFirst := restic.NewIDSet()
 	removePacks := restic.NewIDSet()
 	repackPacks := restic.NewIDSet()

 	var repackCandidates []packInfoWithID
-	repackAllPacksWithDuplicates := true
-
-	keep := func(p packInfo) {
-		stats.packs.keep++
-		if p.duplicateBlobs > 0 {
-			repackAllPacksWithDuplicates = false
-		}
-	}
-
 	repoVersion := repo.Config().Version

 	// loop over all packs and decide what to do
@ -346,8 +381,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 			return nil
 		}

-		if p.unusedSize+p.usedSize != uint64(packSize) &&
-			!(p.usedBlobs == 0 && p.duplicateBlobs == 0) {
+		if p.unusedSize+p.usedSize != uint64(packSize) && p.usedBlobs != 0 {
 			// Pack size does not fit and pack is needed => error
 			// If the pack is not needed, this is no error, the pack can
 			// and will be simply removed, see below.
@ -358,7 +392,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

 		// statistics
 		switch {
-		case p.usedBlobs == 0 && p.duplicateBlobs == 0:
+		case p.usedBlobs == 0:
 			stats.packs.unused++
 		case p.unusedBlobs == 0:
 			stats.packs.used++
@ -377,7 +411,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

 		// decide what to do
 		switch {
-		case p.usedBlobs == 0 && p.duplicateBlobs == 0:
+		case p.usedBlobs == 0:
 			// All blobs in pack are no longer used => remove pack!
 			removePacks.Insert(id)
 			stats.blobs.remove += p.unusedBlobs
@ -385,11 +419,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

 		case opts.RepackCachableOnly && p.tpe == restic.DataBlob:
 			// if this is a data pack and --repack-cacheable-only is set => keep pack!
-			keep(p)
+			stats.packs.keep++

-		case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
-			// All blobs in pack are used and not duplicates/mixed => keep pack!
-			keep(p)
+		case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
+			// All blobs in pack are used and not mixed => keep pack!
+			stats.packs.keep++

 		default:
 			// all other packs are candidates for repacking
@ -410,7 +444,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 	// missing packs that are not needed can be ignored
 	ignorePacks := restic.NewIDSet()
 	for id, p := range indexPack {
-		if p.usedBlobs == 0 && p.duplicateBlobs == 0 {
+		if p.usedBlobs == 0 {
 			ignorePacks.Insert(id)
 			stats.blobs.remove += p.unusedBlobs
 			stats.size.remove += p.unusedSize
@ -439,15 +473,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 	// This is equivalent to sorting by unused / total space.
 	// Instead of unused[i] / used[i] > unused[j] / used[j] we use
 	// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
-	// Morover duplicates and packs containing trees are sorted to the beginning
+	// Morover packs containing trees are sorted to the beginning
 	sort.Slice(repackCandidates, func(i, j int) bool {
 		pi := repackCandidates[i].packInfo
 		pj := repackCandidates[j].packInfo
 		switch {
-		case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0:
-			return true
-		case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0:
-			return false
 		case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob:
 			return true
 		case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob:
@ -458,7 +488,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

 	repack := func(id restic.ID, p packInfo) {
 		repackPacks.Insert(id)
-		stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs
+		stats.blobs.repack += p.unusedBlobs + p.usedBlobs
 		stats.size.repack += p.unusedSize + p.usedSize
 		stats.blobs.repackrm += p.unusedBlobs
 		stats.size.repackrm += p.unusedSize
@ -470,25 +500,33 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

 		switch {
 		case reachedRepackSize:
-			keep(p.packInfo)
+			stats.packs.keep++

-		case p.duplicateBlobs > 0, p.tpe != restic.DataBlob, p.uncompressed:
-			// repacking duplicates/non-data/uncompressed-trees is only limited by repackSize
+		case p.tpe != restic.DataBlob, p.uncompressed:
+			// repacking non-data packs / uncompressed-trees is only limited by repackSize
 			repack(p.ID, p.packInfo)

 		case reachedUnusedSizeAfter:
 			// for all other packs stop repacking if tolerated unused size is reached.
-			keep(p.packInfo)
+			stats.packs.keep++

 		default:
 			repack(p.ID, p.packInfo)
 		}
 	}

-	// if all duplicates are repacked, print out correct statistics
-	if repackAllPacksWithDuplicates {
-		stats.blobs.repackrm += stats.blobs.duplicate
-		stats.size.repackrm += stats.size.duplicate
+	if len(repackPacks) != 0 {
+		// when repacking, we do not want to keep blobs which are
+		// already contained in kept packs, so delete them from keepBlobs
+		for blob := range repo.Index().Each(ctx) {
+			if removePacks.Has(blob.PackID) || repackPacks.Has(blob.PackID) {
+				continue
+			}
+			keepBlobs.Delete(blob.BlobHandle)
+		}
+	} else {
+		// keepBlobs is only needed if packs are repacked
+		keepBlobs = nil
 	}

 	Verboseff("\nused:         %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used))