Merge pull request #3786 from greatroar/prune

restic prune: Merge three loops over the index
This commit is contained in:
MichaelEischer 2022-06-18 16:54:50 +02:00 committed by GitHub
commit 19581dbc18
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 59 deletions

View file

@ -242,11 +242,26 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
Verbosef("searching used packs...\n")
indexPack := make(map[restic.ID]packInfo)
keepBlobs := restic.NewBlobSet()
duplicateBlobs := restic.NewBlobSet()
// iterate over all blobs in index to find out which blobs are duplicates
// iterate over all blobs in index to generate packInfo and find duplicates
for blob := range repo.Index().Each(ctx) {
ip, seen := indexPack[blob.PackID]
if seen {
// mark mixed packs with "Invalid blob type"
if ip.tpe != blob.Type {
ip.tpe = restic.InvalidBlob
}
} else {
ip = packInfo{
tpe: blob.Type,
usedSize: pack.HeaderSize,
}
}
ip.usedSize += uint64(pack.CalculateEntrySize(blob.Blob))
bh := blob.BlobHandle
size := uint64(blob.Length)
switch {
@ -255,14 +270,27 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
keepBlobs.Insert(bh)
stats.size.used += size
stats.blobs.used++
case keepBlobs.Has(bh): // duplicate blob
duplicateBlobs.Insert(bh)
ip.usedSize += size
ip.usedBlobs++
case keepBlobs.Has(bh): // duplicate of a blob that we want to keep
stats.size.duplicate += size
stats.blobs.duplicate++
default:
ip.usedSize += size
ip.duplicateBlobs++
default: // unused, don't care if it's a duplicate
stats.size.unused += size
stats.blobs.unused++
ip.unusedSize += size
ip.unusedBlobs++
}
if !blob.IsCompressed() {
ip.uncompressed = true
}
// update indexPack
indexPack[blob.PackID] = ip
}
// Check if all used blobs have been found in index
@ -275,48 +303,6 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
return errorIndexIncomplete
}
indexPack := make(map[restic.ID]packInfo)
// save computed pack header size
for pid, hdrSize := range pack.Size(ctx, repo.Index(), true) {
// initialize tpe with NumBlobTypes to indicate it's not set
indexPack[pid] = packInfo{tpe: restic.NumBlobTypes, usedSize: uint64(hdrSize)}
}
// iterate over all blobs in index to generate packInfo
for blob := range repo.Index().Each(ctx) {
ip := indexPack[blob.PackID]
// Set blob type if not yet set
if ip.tpe == restic.NumBlobTypes {
ip.tpe = blob.Type
}
// mark mixed packs with "Invalid blob type"
if ip.tpe != blob.Type {
ip.tpe = restic.InvalidBlob
}
bh := blob.BlobHandle
size := uint64(blob.Length)
switch {
case duplicateBlobs.Has(bh): // duplicate blob
ip.usedSize += size
ip.duplicateBlobs++
case keepBlobs.Has(bh): // used blob, not duplicate
ip.usedSize += size
ip.usedBlobs++
default: // unused blob
ip.unusedSize += size
ip.unusedBlobs++
}
if !blob.IsCompressed() {
ip.uncompressed = true
}
// update indexPack
indexPack[blob.PackID] = ip
}
Verbosef("collecting packs for deletion and repacking\n")
removePacksFirst := restic.NewIDSet()
removePacks := restic.NewIDSet()

View file

@ -98,7 +98,7 @@ func rebuildIndex(opts RebuildIndexOptions, gopts GlobalOptions, repo *repositor
if err != nil {
return err
}
packSizeFromIndex = pack.Size(ctx, repo.Index(), false)
packSizeFromIndex = pack.Size(ctx, repo.Index())
}
Verbosef("getting pack files to read...\n")

View file

@ -131,7 +131,7 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
}
// compute pack size using index entries
c.packs = pack.Size(ctx, c.masterIndex, false)
c.packs = pack.Size(ctx, c.masterIndex)
debug.Log("checking for duplicate packs")
for packID := range c.packs {

View file

@ -177,8 +177,8 @@ var (
const (
// size of the header-length field at the end of the file; it is a uint32
headerLengthSize = 4
// headerSize is the header's constant overhead (independent of #entries)
headerSize = headerLengthSize + crypto.Extension
// HeaderSize is the header's constant overhead (independent of #entries)
HeaderSize = headerLengthSize + crypto.Extension
// MaxHeaderSize is the max size of header including header-length field
MaxHeaderSize = 16*1024*1024 + headerLengthSize
@ -242,7 +242,7 @@ func readHeader(rd io.ReaderAt, size int64) ([]byte, error) {
// eagerly download eagerEntries header entries as part of header-length request.
// only make second request if actual number of entries is greater than eagerEntries
eagerSize := eagerEntries*int(entrySize) + headerSize
eagerSize := eagerEntries*int(entrySize) + HeaderSize
b, c, err := readRecords(rd, size, eagerSize)
if err != nil {
return nil, err
@ -349,7 +349,7 @@ func CalculateEntrySize(blob restic.Blob) int {
}
func CalculateHeaderSize(blobs []restic.Blob) int {
size := headerSize
size := HeaderSize
for _, blob := range blobs {
size += CalculateEntrySize(blob)
}
@ -357,20 +357,17 @@ func CalculateHeaderSize(blobs []restic.Blob) int {
}
// Size returns the size of all packs computed by index information.
// If onlyHdr is set to true, only the size of the header is returned
// Note that this function gives correct sizes only if there are no
// duplicates in the index.
func Size(ctx context.Context, mi restic.MasterIndex, onlyHdr bool) map[restic.ID]int64 {
func Size(ctx context.Context, mi restic.MasterIndex) map[restic.ID]int64 {
packSize := make(map[restic.ID]int64)
for blob := range mi.Each(ctx) {
size, ok := packSize[blob.PackID]
if !ok {
size = headerSize
}
if !onlyHdr {
size += int64(blob.Length)
size = HeaderSize
}
size += int64(blob.Length)
packSize[blob.PackID] = size + int64(CalculateEntrySize(blob.Blob))
}