Merge pull request #3101 from aawsome/packsizes

Compute packsizes in MasterIndex
This commit is contained in:
Alexander Neumann 2020-11-22 15:49:19 +01:00 committed by GitHub
commit c844580e0f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 86 additions and 72 deletions

View file

@ -8,7 +8,6 @@ import (
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/errors"
"github.com/restic/restic/internal/pack"
"github.com/restic/restic/internal/repository"
"github.com/restic/restic/internal/restic"
@ -233,7 +232,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
// iterate over all blobs in index to find out which blobs are duplicates
for blob := range repo.Index().Each(ctx) {
bh := blob.Handle()
size := uint64(pack.PackedSizeOfBlob(blob.Length))
size := uint64(blob.Length)
switch {
case usedBlobs.Has(bh): // used blob, move to keepBlobs
usedBlobs.Delete(bh)
@ -261,19 +260,28 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
indexPack := make(map[restic.ID]packInfo)
// save computed pack header size
for pid, hdrSize := range repo.Index().PackSize(ctx, true) {
// initialize tpe with NumBlobTypes to indicate it's not set
indexPack[pid] = packInfo{tpe: restic.NumBlobTypes, usedSize: uint64(hdrSize)}
}
// iterate over all blobs in index to generate packInfo
for blob := range repo.Index().Each(ctx) {
ip, ok := indexPack[blob.PackID]
if !ok {
ip = packInfo{tpe: blob.Type, usedSize: pack.HeaderSize}
ip := indexPack[blob.PackID]
// Set blob type if not yet set
if ip.tpe == restic.NumBlobTypes {
ip.tpe = blob.Type
}
// mark mixed packs with "Invalid blob type"
if ip.tpe != blob.Type {
ip.tpe = restic.InvalidBlob
}
bh := blob.Handle()
size := uint64(pack.PackedSizeOfBlob(blob.Length))
size := uint64(blob.Length)
switch {
case duplicateBlobs.Has(bh): // duplicate blob
ip.usedSize += size

View file

@ -1,7 +1,6 @@
package main
import (
"github.com/restic/restic/internal/pack"
"github.com/restic/restic/internal/repository"
"github.com/restic/restic/internal/restic"
@ -91,17 +90,7 @@ func rebuildIndex(opts RebuildIndexOptions, gopts GlobalOptions, repo *repositor
}
Verbosef("getting pack files to read...\n")
// Compute size of each pack from index entries
packSizeFromIndex := make(map[restic.ID]int64)
for blob := range repo.Index().Each(ctx) {
size, ok := packSizeFromIndex[blob.PackID]
if !ok {
size = pack.HeaderSize
}
// update packSizeFromIndex
packSizeFromIndex[blob.PackID] = size + int64(pack.PackedSizeOfBlob(blob.Length))
}
packSizeFromIndex := repo.Index().PackSize(ctx, false)
err = repo.List(ctx, restic.PackFile, func(id restic.ID, packSize int64) error {
size, ok := packSizeFromIndex[id]

View file

@ -178,13 +178,7 @@ func (c *Checker) LoadIndex(ctx context.Context) (hints []error, errs []error) {
c.masterIndex.MergeFinalIndexes()
// compute pack size using index entries
for blob := range c.masterIndex.Each(ctx) {
size, ok := c.packs[blob.PackID]
if !ok {
size = pack.HeaderSize
}
c.packs[blob.PackID] = size + int64(pack.PackedSizeOfBlob(blob.Length))
}
c.packs = c.masterIndex.PackSize(ctx, false)
debug.Log("checking for duplicate packs")
for packID := range c.packs {
@ -749,17 +743,17 @@ func checkPack(ctx context.Context, r restic.Repository, id restic.ID, size int6
return errors.Errorf("Pack size does not match, want %v, got %v", size, realSize)
}
blobs, err := pack.List(r.Key(), packfile, size)
blobs, hdrSize, err := pack.List(r.Key(), packfile, size)
if err != nil {
return err
}
var errs []error
var buf []byte
sizeFromBlobs := int64(pack.HeaderSize) // pack size computed only from blob information
sizeFromBlobs := uint(hdrSize)
idx := r.Index()
for i, blob := range blobs {
sizeFromBlobs += int64(pack.PackedSizeOfBlob(blob.Length))
sizeFromBlobs += blob.Length
debug.Log(" check blob %d: %v", i, blob)
buf = buf[:cap(buf)]
@ -809,7 +803,7 @@ func checkPack(ctx context.Context, r restic.Repository, id restic.ID, size int6
}
}
if sizeFromBlobs != size {
if int64(sizeFromBlobs) != size {
debug.Log("Pack size does not match, want %v, got %v", size, sizeFromBlobs)
errs = append(errs, errors.Errorf("Pack size does not match, want %v, got %v", size, sizeFromBlobs))
}

View file

@ -46,7 +46,7 @@ func (p *Packer) Add(t restic.BlobType, id restic.ID, data []byte) (int, error)
return n, errors.Wrap(err, "Write")
}
var entrySize = uint(binary.Size(restic.BlobType(0)) + binary.Size(uint32(0)) + len(restic.ID{}))
var EntrySize = uint(binary.Size(restic.BlobType(0)) + headerLengthSize + len(restic.ID{}))
// headerEntry describes the format of header entries. It serves only as
// documentation.
@ -88,7 +88,7 @@ func (p *Packer) Finalize() (uint, error) {
bytesWritten += uint(hdrBytes)
// write length
err = binary.Write(p.wr, binary.LittleEndian, uint32(restic.CiphertextLength(len(p.blobs)*int(entrySize))))
err = binary.Write(p.wr, binary.LittleEndian, uint32(restic.CiphertextLength(len(p.blobs)*int(EntrySize))))
if err != nil {
return 0, errors.Wrap(err, "binary.Write")
}
@ -100,7 +100,7 @@ func (p *Packer) Finalize() (uint, error) {
// makeHeader constructs the header for p.
func (p *Packer) makeHeader() ([]byte, error) {
buf := make([]byte, 0, len(p.blobs)*int(entrySize))
buf := make([]byte, 0, len(p.blobs)*int(EntrySize))
for _, b := range p.blobs {
switch b.Type {
@ -151,7 +151,7 @@ func (p *Packer) String() string {
var (
// we require at least one entry in the header, and one blob for a pack file
minFileSize = entrySize + crypto.Extension + uint(headerLengthSize)
minFileSize = EntrySize + crypto.Extension + uint(headerLengthSize)
)
const (
@ -171,7 +171,7 @@ const (
// the appropriate size.
func readRecords(rd io.ReaderAt, size int64, max int) ([]byte, int, error) {
var bufsize int
bufsize += max * int(entrySize)
bufsize += max * int(EntrySize)
bufsize += crypto.Extension
bufsize += headerLengthSize
@ -195,7 +195,7 @@ func readRecords(rd io.ReaderAt, size int64, max int) ([]byte, int, error) {
err = InvalidFileError{Message: "header length is zero"}
case hlen < crypto.Extension:
err = InvalidFileError{Message: "header length is too small"}
case (hlen-crypto.Extension)%uint32(entrySize) != 0:
case (hlen-crypto.Extension)%uint32(EntrySize) != 0:
err = InvalidFileError{Message: "header length is invalid"}
case int64(hlen) > size-int64(headerLengthSize):
err = InvalidFileError{Message: "header is larger than file"}
@ -206,7 +206,7 @@ func readRecords(rd io.ReaderAt, size int64, max int) ([]byte, int, error) {
return nil, 0, errors.Wrap(err, "readHeader")
}
total := (int(hlen) - crypto.Extension) / int(entrySize)
total := (int(hlen) - crypto.Extension) / int(EntrySize)
if total < max {
// truncate to the beginning of the pack header
b = b[len(b)-int(hlen):]
@ -252,52 +252,55 @@ func (e InvalidFileError) Error() string {
return e.Message
}
// List returns the list of entries found in a pack file.
func List(k *crypto.Key, rd io.ReaderAt, size int64) (entries []restic.Blob, err error) {
// List returns the list of entries found in a pack file and the length of the
// header (including the header length field and the crypto overhead).
func List(k *crypto.Key, rd io.ReaderAt, size int64) (entries []restic.Blob, hdrSize uint32, err error) {
buf, err := readHeader(rd, size)
if err != nil {
return nil, err
return nil, 0, err
}
if len(buf) < k.NonceSize()+k.Overhead() {
return nil, errors.New("invalid header, too small")
return nil, 0, errors.New("invalid header, too small")
}
hdrSize = headerLengthSize + uint32(len(buf))
nonce, buf := buf[:k.NonceSize()], buf[k.NonceSize():]
buf, err = k.Open(buf[:0], nonce, buf, nil)
if err != nil {
return nil, err
return nil, 0, err
}
entries = make([]restic.Blob, 0, uint(len(buf))/entrySize)
entries = make([]restic.Blob, 0, uint(len(buf))/EntrySize)
pos := uint(0)
for len(buf) > 0 {
entry, err := parseHeaderEntry(buf)
if err != nil {
return nil, err
return nil, 0, err
}
entry.Offset = pos
entries = append(entries, entry)
pos += entry.Length
buf = buf[entrySize:]
buf = buf[EntrySize:]
}
return entries, nil
return entries, hdrSize, nil
}
// PackedSizeOfBlob returns the size a blob actually uses when saved in a pack
func PackedSizeOfBlob(blobLength uint) uint {
return blobLength + entrySize
return blobLength + EntrySize
}
func parseHeaderEntry(p []byte) (b restic.Blob, err error) {
if uint(len(p)) < entrySize {
if uint(len(p)) < EntrySize {
err = errors.Errorf("parseHeaderEntry: buffer of size %d too short", len(p))
return b, err
}
p = p[:entrySize]
p = p[:EntrySize]
switch p[0] {
case 0:

View file

@ -41,7 +41,7 @@ func TestParseHeaderEntry(t *testing.T) {
buf.Reset()
_ = binary.Write(buf, binary.LittleEndian, &h)
b, err = parseHeaderEntry(buf.Bytes()[:entrySize-1])
b, err = parseHeaderEntry(buf.Bytes()[:EntrySize-1])
rtest.Assert(t, err != nil, "no error for short input")
}
@ -58,7 +58,7 @@ func (rd *countingReaderAt) ReadAt(p []byte, off int64) (n int, err error) {
func TestReadHeaderEagerLoad(t *testing.T) {
testReadHeader := func(dataSize, entryCount, expectedReadInvocationCount int) {
expectedHeader := rtest.Random(0, entryCount*int(entrySize)+crypto.Extension)
expectedHeader := rtest.Random(0, entryCount*int(EntrySize)+crypto.Extension)
buf := &bytes.Buffer{}
buf.Write(rtest.Random(0, dataSize)) // pack blobs data
@ -83,8 +83,8 @@ func TestReadHeaderEagerLoad(t *testing.T) {
testReadHeader(100, eagerEntries+1, 2)
// file size == eager header load size
eagerLoadSize := int((eagerEntries * entrySize) + crypto.Extension)
headerSize := int(1*entrySize) + crypto.Extension
eagerLoadSize := int((eagerEntries * EntrySize) + crypto.Extension)
headerSize := int(1*EntrySize) + crypto.Extension
dataSize := eagerLoadSize - headerSize - binary.Size(uint32(0))
testReadHeader(dataSize-1, 1, 1)
testReadHeader(dataSize, 1, 1)
@ -96,8 +96,8 @@ func TestReadHeaderEagerLoad(t *testing.T) {
func TestReadRecords(t *testing.T) {
testReadRecords := func(dataSize, entryCount, totalRecords int) {
totalHeader := rtest.Random(0, totalRecords*int(entrySize)+crypto.Extension)
off := len(totalHeader) - (entryCount*int(entrySize) + crypto.Extension)
totalHeader := rtest.Random(0, totalRecords*int(EntrySize)+crypto.Extension)
off := len(totalHeader) - (entryCount*int(EntrySize) + crypto.Extension)
if off < 0 {
off = 0
}
@ -127,8 +127,8 @@ func TestReadRecords(t *testing.T) {
testReadRecords(100, eagerEntries, eagerEntries+1)
// file size == eager header load size
eagerLoadSize := int((eagerEntries * entrySize) + crypto.Extension)
headerSize := int(1*entrySize) + crypto.Extension
eagerLoadSize := int((eagerEntries * EntrySize) + crypto.Extension)
headerSize := int(1*EntrySize) + crypto.Extension
dataSize := eagerLoadSize - headerSize - binary.Size(uint32(0))
testReadRecords(dataSize-1, 1, 1)
testReadRecords(dataSize, 1, 1)

View file

@ -53,19 +53,18 @@ func verifyBlobs(t testing.TB, bufs []Buf, k *crypto.Key, rd io.ReaderAt, packSi
for _, buf := range bufs {
written += len(buf.data)
}
// header length
written += binary.Size(uint32(0))
// header + header crypto
headerSize := len(bufs) * (binary.Size(restic.BlobType(0)) + binary.Size(uint32(0)) + len(restic.ID{}))
written += restic.CiphertextLength(headerSize)
// header length + header + header crypto
headerSize := binary.Size(uint32(0)) + restic.CiphertextLength(len(bufs)*int(pack.EntrySize))
written += headerSize
// check length
rtest.Equals(t, uint(written), packSize)
// read and parse it again
entries, err := pack.List(k, rd, int64(packSize))
entries, hdrSize, err := pack.List(k, rd, int64(packSize))
rtest.OK(t, err)
rtest.Equals(t, len(entries), len(bufs))
rtest.Equals(t, headerSize, int(hdrSize))
var buf []byte
for i, b := range bufs {

View file

@ -5,6 +5,7 @@ import (
"sync"
"github.com/restic/restic/internal/debug"
"github.com/restic/restic/internal/pack"
"github.com/restic/restic/internal/restic"
"github.com/restic/restic/internal/ui/progress"
"golang.org/x/sync/errgroup"
@ -111,6 +112,27 @@ func (mi *MasterIndex) Packs() restic.IDSet {
return packs
}
// PackSize derives a size for every pack referenced by the index, using only
// the index entries. Each pack starts at pack.HeaderSize and grows by
// pack.EntrySize for every blob listed for it; unless onlyHdr is true, the
// length of each blob is added as well, so with onlyHdr set the result covers
// just the header portion of each pack.
//
// The sizes are only correct if the index contains no duplicate entries,
// because every entry yielded by Each is counted.
func (mi *MasterIndex) PackSize(ctx context.Context, onlyHdr bool) map[restic.ID]int64 {
	sizes := make(map[restic.ID]int64)

	for entry := range mi.Each(ctx) {
		total, seen := sizes[entry.PackID]
		if !seen {
			// first entry seen for this pack: start from the fixed header part
			total = pack.HeaderSize
		}

		// every blob contributes exactly one header entry
		total += int64(pack.EntrySize)
		if !onlyHdr {
			total += int64(entry.Length)
		}

		sizes[entry.PackID] = total
	}

	return sizes
}
// Count returns the number of blobs of type t in the index.
func (mi *MasterIndex) Count(t restic.BlobType) (n uint) {
mi.idxMutex.RLock()

View file

@ -92,7 +92,7 @@ func Repack(ctx context.Context, repo restic.Repository, packs restic.IDSet, kee
for job := range processQueue {
tempfile, packID, packLength := job.tempfile, job.hash, job.packLength
blobs, err := pack.List(repo.Key(), tempfile, packLength)
blobs, _, err := pack.List(repo.Key(), tempfile, packLength)
if err != nil {
return err
}

View file

@ -740,16 +740,11 @@ func (r *Repository) List(ctx context.Context, t restic.FileType, fn func(restic
}
// ListPack returns the list of blobs saved in the pack id and the length of
// the file as stored in the backend.
func (r *Repository) ListPack(ctx context.Context, id restic.ID, size int64) ([]restic.Blob, int64, error) {
// the pack header.
func (r *Repository) ListPack(ctx context.Context, id restic.ID, size int64) ([]restic.Blob, uint32, error) {
h := restic.Handle{Type: restic.PackFile, Name: id.String()}
blobs, err := pack.List(r.Key(), restic.ReaderAt(ctx, r.Backend(), h), size)
if err != nil {
return nil, 0, err
}
return blobs, size, nil
return pack.List(r.Key(), restic.ReaderAt(ctx, r.Backend(), h), size)
}
// Delete calls backend.Delete() if implemented, and returns an error

View file

@ -32,7 +32,10 @@ type Repository interface {
//
// The function fn is called in the same Goroutine List() was called from.
List(ctx context.Context, t FileType, fn func(ID, int64) error) error
ListPack(context.Context, ID, int64) ([]Blob, int64, error)
// ListPack returns the list of blobs saved in the pack id and the length of
// the pack header.
ListPack(context.Context, ID, int64) ([]Blob, uint32, error)
Flush(context.Context) error
@ -63,6 +66,7 @@ type MasterIndex interface {
Lookup(ID, BlobType) []PackedBlob
Count(BlobType) uint
Packs() IDSet
PackSize(ctx context.Context, onlyHdr bool) map[ID]int64
// Each returns a channel that yields all blobs known to the index. When
// the context is cancelled, the background goroutine terminates. This