Merge pull request #3786 from greatroar/prune

restic prune: Merge three loops over the index
2022-06-18 16:54:50 +02:00 · 2022-06-18 16:54:50 +02:00 · 19581dbc18
commit 19581dbc18
parent 2c893fe43c 8bdfcf779f
4 changed files with 42 additions and 59 deletions
--- a/cmd/restic/cmd_prune.go
+++ b/cmd/restic/cmd_prune.go
@@ -242,11 +242,26 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB

 	Verbosef("searching used packs...\n")

+	indexPack := make(map[restic.ID]packInfo)
 	keepBlobs := restic.NewBlobSet()
-	duplicateBlobs := restic.NewBlobSet()

-	// iterate over all blobs in index to find out which blobs are duplicates
+	// iterate over all blobs in index to generate packInfo and find duplicates
 	for blob := range repo.Index().Each(ctx) {
+		ip, seen := indexPack[blob.PackID]
+
+		if seen {
+			// mark mixed packs with "Invalid blob type"
+			if ip.tpe != blob.Type {
+				ip.tpe = restic.InvalidBlob
+			}
+		} else {
+			ip = packInfo{
+				tpe:      blob.Type,
+				usedSize: pack.HeaderSize,
+			}
+		}
+		ip.usedSize += uint64(pack.CalculateEntrySize(blob.Blob))
+
 		bh := blob.BlobHandle
 		size := uint64(blob.Length)
 		switch {
@@ -255,14 +270,27 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 			keepBlobs.Insert(bh)
 			stats.size.used += size
 			stats.blobs.used++
-		case keepBlobs.Has(bh): // duplicate blob
-			duplicateBlobs.Insert(bh)
+			ip.usedSize += size
+			ip.usedBlobs++
+
+		case keepBlobs.Has(bh): // duplicate of a blob that we want to keep
 			stats.size.duplicate += size
 			stats.blobs.duplicate++
-		default:
+			ip.usedSize += size
+			ip.duplicateBlobs++
+
+		default: // unused, don't care if it's a duplicate
 			stats.size.unused += size
 			stats.blobs.unused++
+			ip.unusedSize += size
+			ip.unusedBlobs++
 		}
+
+		if !blob.IsCompressed() {
+			ip.uncompressed = true
+		}
+		// update indexPack
+		indexPack[blob.PackID] = ip
 	}

 	// Check if all used blobs have been found in index
@@ -275,48 +303,6 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 		return errorIndexIncomplete
 	}

-	indexPack := make(map[restic.ID]packInfo)
-
-	// save computed pack header size
-	for pid, hdrSize := range pack.Size(ctx, repo.Index(), true) {
-		// initialize tpe with NumBlobTypes to indicate it's not set
-		indexPack[pid] = packInfo{tpe: restic.NumBlobTypes, usedSize: uint64(hdrSize)}
-	}
-
-	// iterate over all blobs in index to generate packInfo
-	for blob := range repo.Index().Each(ctx) {
-		ip := indexPack[blob.PackID]
-
-		// Set blob type if not yet set
-		if ip.tpe == restic.NumBlobTypes {
-			ip.tpe = blob.Type
-		}
-
-		// mark mixed packs with "Invalid blob type"
-		if ip.tpe != blob.Type {
-			ip.tpe = restic.InvalidBlob
-		}
-
-		bh := blob.BlobHandle
-		size := uint64(blob.Length)
-		switch {
-		case duplicateBlobs.Has(bh): // duplicate blob
-			ip.usedSize += size
-			ip.duplicateBlobs++
-		case keepBlobs.Has(bh): // used blob, not duplicate
-			ip.usedSize += size
-			ip.usedBlobs++
-		default: // unused blob
-			ip.unusedSize += size
-			ip.unusedBlobs++
-		}
-		if !blob.IsCompressed() {
-			ip.uncompressed = true
-		}
-		// update indexPack
-		indexPack[blob.PackID] = ip
-	}
-
 	Verbosef("collecting packs for deletion and repacking\n")
 	removePacksFirst := restic.NewIDSet()
 	removePacks := restic.NewIDSet()
--- a/cmd/restic/cmd_rebuild_index.go
+++ b/cmd/restic/cmd_rebuild_index.go
@@ -98,7 +98,7 @@ func rebuildIndex(opts RebuildIndexOptions, gopts GlobalOptions, repo *repositor
 		if err != nil {
 			return err
 		}
-		packSizeFromIndex = pack.Size(ctx, repo.Index(), false)
+		packSizeFromIndex = pack.Size(ctx, repo.Index())
 	}

 	Verbosef("getting pack files to read...\n")
--- a/internal/checker/checker.go
+++ b/internal/checker/checker.go
@@ -131,7 +131,7 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
 	}

 	// compute pack size using index entries
-	c.packs = pack.Size(ctx, c.masterIndex, false)
+	c.packs = pack.Size(ctx, c.masterIndex)

 	debug.Log("checking for duplicate packs")
 	for packID := range c.packs {
--- a/internal/pack/pack.go
+++ b/internal/pack/pack.go
@@ -177,8 +177,8 @@ var (
 const (
 	// size of the header-length field at the end of the file; it is a uint32
 	headerLengthSize = 4
-	// headerSize is the header's constant overhead (independent of #entries)
-	headerSize = headerLengthSize + crypto.Extension
+	// HeaderSize is the header's constant overhead (independent of #entries)
+	HeaderSize = headerLengthSize + crypto.Extension

 	// MaxHeaderSize is the max size of header including header-length field
 	MaxHeaderSize = 16*1024*1024 + headerLengthSize
@@ -242,7 +242,7 @@ func readHeader(rd io.ReaderAt, size int64) ([]byte, error) {
 	// eagerly download eagerEntries header entries as part of header-length request.
 	// only make second request if actual number of entries is greater than eagerEntries

-	eagerSize := eagerEntries*int(entrySize) + headerSize
+	eagerSize := eagerEntries*int(entrySize) + HeaderSize
 	b, c, err := readRecords(rd, size, eagerSize)
 	if err != nil {
 		return nil, err
@@ -349,7 +349,7 @@ func CalculateEntrySize(blob restic.Blob) int {
 }

 func CalculateHeaderSize(blobs []restic.Blob) int {
-	size := headerSize
+	size := HeaderSize
 	for _, blob := range blobs {
 		size += CalculateEntrySize(blob)
 	}
@@ -357,20 +357,17 @@ func CalculateHeaderSize(blobs []restic.Blob) int {
 }

 // Size returns the size of all packs computed by index information.
-// If onlyHdr is set to true, only the size of the header is returned
 // Note that this function only gives correct sizes, if there are no
 // duplicates in the index.
-func Size(ctx context.Context, mi restic.MasterIndex, onlyHdr bool) map[restic.ID]int64 {
+func Size(ctx context.Context, mi restic.MasterIndex) map[restic.ID]int64 {
 	packSize := make(map[restic.ID]int64)

 	for blob := range mi.Each(ctx) {
 		size, ok := packSize[blob.PackID]
 		if !ok {
-			size = headerSize
-		}
-		if !onlyHdr {
-			size += int64(blob.Length)
+			size = HeaderSize
 		}
+		size += int64(blob.Length)
 		packSize[blob.PackID] = size + int64(CalculateEntrySize(blob.Blob))
 	}