From ffca60231570f7a23f8304011a98c85926117433 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 11 Sep 2022 13:51:08 +0200 Subject: [PATCH 01/11] repository: Fix panic in benchmarkLoadIndex --- internal/repository/repository_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/repository/repository_test.go b/internal/repository/repository_test.go index f3516856e..f26bf46f2 100644 --- a/internal/repository/repository_test.go +++ b/internal/repository/repository_test.go @@ -346,6 +346,7 @@ func benchmarkLoadIndex(b *testing.B, version uint) { }, }) } + idx.Finalize() id, err := index.SaveIndex(context.TODO(), repo, idx) rtest.OK(b, err) From 0c1240360dbf7ff04423ff3d2dfcabf7600a290e Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 22 Oct 2022 23:37:31 +0200 Subject: [PATCH 02/11] index: add garbage collection benchmark Allocates an index and repeatedly triggers the GC. --- internal/index/master_index_test.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/internal/index/master_index_test.go b/internal/index/master_index_test.go index 9a1970827..5d12956bd 100644 --- a/internal/index/master_index_test.go +++ b/internal/index/master_index_test.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "math/rand" + "runtime" "testing" "time" @@ -323,6 +324,17 @@ func BenchmarkMasterIndexEach(b *testing.B) { } } +func BenchmarkMasterIndexGC(b *testing.B) { + mIdx, _ := createRandomMasterIndex(b, rand.New(rand.NewSource(0)), 100, 10000) + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + runtime.GC() + } + runtime.KeepAlive(mIdx) +} + var ( snapshotTime = time.Unix(1470492820, 207401672) depth = 3 From b217f38ee70d064b18c10378eccfc190c254743d Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 5 Feb 2022 21:25:23 +0100 Subject: [PATCH 03/11] index: Remove pointers from within indexEntries The indexEntry objects are now allocated in a separate array. References to an indexEntry are now stored as array indices. 
This has the benefit of allowing the garbage collector to ignore the indexEntry objects as these do not contain pointers and are part of a single large allocation. --- internal/index/indexmap.go | 78 +++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/internal/index/indexmap.go b/internal/index/indexmap.go index ef3539d48..60ab11ff7 100644 --- a/internal/index/indexmap.go +++ b/internal/index/indexmap.go @@ -17,12 +17,12 @@ import ( // needs to be resized when the table grows, preventing memory usage spikes. type indexMap struct { // The number of buckets is always a power of two and never zero. - buckets []*indexEntry + buckets []uint numentries uint mh maphash.Hash - free *indexEntry // Free list. + blockList []indexEntry } const ( @@ -41,7 +41,7 @@ func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32, uncompr } h := m.hash(id) - e := m.newEntry() + e, idx := m.newEntry() e.id = id e.next = m.buckets[h] // Prepend to existing chain. e.packIndex = packIdx @@ -49,18 +49,19 @@ func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32, uncompr e.length = length e.uncompressedLength = uncompressedLength - m.buckets[h] = e + m.buckets[h] = idx m.numentries++ } // foreach calls fn for all entries in the map, until fn returns false. 
func (m *indexMap) foreach(fn func(*indexEntry) bool) { - for _, e := range m.buckets { - for e != nil { + for _, ei := range m.buckets { + for ei != 0 { + e := m.resolve(ei) if !fn(e) { return } - e = e.next + ei = e.next } } } @@ -72,7 +73,10 @@ func (m *indexMap) foreachWithID(id restic.ID, fn func(*indexEntry)) { } h := m.hash(id) - for e := m.buckets[h]; e != nil; e = e.next { + ei := m.buckets[h] + for ei != 0 { + e := m.resolve(ei) + ei = e.next if e.id != id { continue } @@ -87,25 +91,29 @@ func (m *indexMap) get(id restic.ID) *indexEntry { } h := m.hash(id) - for e := m.buckets[h]; e != nil; e = e.next { + ei := m.buckets[h] + for ei != 0 { + e := m.resolve(ei) if e.id == id { return e } + ei = e.next } return nil } func (m *indexMap) grow() { old := m.buckets - m.buckets = make([]*indexEntry, growthFactor*len(m.buckets)) + m.buckets = make([]uint, growthFactor*len(m.buckets)) - for _, e := range old { - for e != nil { + for _, ei := range old { + for ei != 0 { + e := m.resolve(ei) h := m.hash(e.id) next := e.next e.next = m.buckets[h] - m.buckets[h] = e - e = next + m.buckets[h] = ei + ei = next } } } @@ -124,45 +132,29 @@ func (m *indexMap) hash(id restic.ID) uint { func (m *indexMap) init() { const initialBuckets = 64 - m.buckets = make([]*indexEntry, initialBuckets) + m.buckets = make([]uint, initialBuckets) + // first entry in blockList serves as null byte + m.blockList = make([]indexEntry, 1) } func (m *indexMap) len() uint { return m.numentries } -func (m *indexMap) newEntry() *indexEntry { - // We keep a free list of objects to speed up allocation and GC. - // There's an obvious trade-off here: allocating in larger batches - // means we allocate faster and the GC has to keep fewer bits to track - // what we have in use, but it means we waste some space. 
- // - // Then again, allocating each indexEntry separately also wastes space - // on 32-bit platforms, because the Go malloc has no size class for - // exactly 52 bytes, so it puts the indexEntry in a 64-byte slot instead. - // See src/runtime/sizeclasses.go in the Go source repo. - // - // The batch size of 4 means we hit the size classes for 4×64=256 bytes - // (64-bit) and 4×52=208 bytes (32-bit), wasting nothing in malloc on - // 64-bit and relatively little on 32-bit. - const entryAllocBatch = 4 +func (m *indexMap) newEntry() (*indexEntry, uint) { + m.blockList = append(m.blockList, indexEntry{}) - e := m.free - if e != nil { - m.free = e.next - } else { - free := new([entryAllocBatch]indexEntry) - e = &free[0] - for i := 1; i < len(free)-1; i++ { - free[i].next = &free[i+1] - } - m.free = &free[1] - } + idx := uint(len(m.blockList) - 1) + e := &m.blockList[idx] - return e + return e, idx +} + +func (m *indexMap) resolve(idx uint) *indexEntry { + return &m.blockList[idx] } type indexEntry struct { id restic.ID - next *indexEntry + next uint packIndex int // Position in containing Index's packs field. offset uint32 length uint32 From fed33295c38197cec49d164725e744c1c9786690 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sun, 28 May 2023 23:42:47 +0200 Subject: [PATCH 04/11] index: store indexEntries in hashed array tree This data structure reduces the wasted memory to O(sqrt(n)). The top-layer of the hashed array tree (HAT) also has a size of O(sqrt(n)), which makes it cache efficient. The top-layer should be small enough to easily fit into the CPU cache and thus only adds little overhead compared to directly accessing an index entry via a pointer. 
--- internal/index/indexmap.go | 94 ++++++++++++++++++++++++++++++++++---- 1 file changed, 85 insertions(+), 9 deletions(-) diff --git a/internal/index/indexmap.go b/internal/index/indexmap.go index 60ab11ff7..811d20903 100644 --- a/internal/index/indexmap.go +++ b/internal/index/indexmap.go @@ -1,6 +1,7 @@ package index import ( + "fmt" "hash/maphash" "github.com/restic/restic/internal/restic" @@ -22,7 +23,7 @@ type indexMap struct { mh maphash.Hash - blockList []indexEntry + blockList hashedArrayTree } const ( @@ -134,22 +135,18 @@ func (m *indexMap) init() { const initialBuckets = 64 m.buckets = make([]uint, initialBuckets) // first entry in blockList serves as null byte - m.blockList = make([]indexEntry, 1) + m.blockList = *newHAT() + m.newEntry() } func (m *indexMap) len() uint { return m.numentries } func (m *indexMap) newEntry() (*indexEntry, uint) { - m.blockList = append(m.blockList, indexEntry{}) - - idx := uint(len(m.blockList) - 1) - e := &m.blockList[idx] - - return e, idx + return m.blockList.Alloc() } func (m *indexMap) resolve(idx uint) *indexEntry { - return &m.blockList[idx] + return m.blockList.Ref(idx) } type indexEntry struct { @@ -160,3 +157,82 @@ type indexEntry struct { length uint32 uncompressedLength uint32 } + +type hashedArrayTree struct { + mask uint + maskShift uint + blockSize uint + + size uint + blockList [][]indexEntry +} + +func newHAT() *hashedArrayTree { + // start with a small block size + blockSizePower := uint(2) + blockSize := uint(1 << blockSizePower) + + return &hashedArrayTree{ + mask: blockSize - 1, + maskShift: blockSizePower, + blockSize: blockSize, + size: 0, + blockList: make([][]indexEntry, blockSize), + } +} + +func (h *hashedArrayTree) Alloc() (*indexEntry, uint) { + h.grow() + size := h.size + idx, subIdx := h.index(size) + h.size++ + return &h.blockList[idx][subIdx], size +} + +func (h *hashedArrayTree) index(pos uint) (idx uint, subIdx uint) { + subIdx = pos & h.mask + idx = pos >> h.maskShift + return +} + 
+func (h *hashedArrayTree) Ref(pos uint) *indexEntry { + if pos >= h.size { + panic(fmt.Sprintf("array index %d out of bounds %d", pos, h.size)) + } + + idx, subIdx := h.index(pos) + return &h.blockList[idx][subIdx] +} + +func (h *hashedArrayTree) Size() uint { + return h.size +} + +func (h *hashedArrayTree) grow() { + idx, subIdx := h.index(h.size) + if int(idx) == len(h.blockList) { + // blockList is too small -> double list and block size + oldBlocks := h.blockList + h.blockList = make([][]indexEntry, h.blockSize) + + h.blockSize *= 2 + h.mask = h.mask*2 + 1 + h.maskShift++ + idx = idx / 2 + + // pairwise merging of blocks + for i := 0; i < len(oldBlocks); i += 2 { + block := make([]indexEntry, 0, h.blockSize) + block = append(block, oldBlocks[i]...) + block = append(block, oldBlocks[i+1]...) + h.blockList[i/2] = block + // allow GC + oldBlocks[i] = nil + oldBlocks[i+1] = nil + } + } + if subIdx == 0 { + // new index entry batch + h.blockList[idx] = make([]indexEntry, h.blockSize) + } +} From 12141afbada61d03495ef2830732e4d2c406fce4 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Mon, 29 May 2023 01:09:33 +0200 Subject: [PATCH 05/11] index: Allow inlining of HAT --- internal/index/indexmap.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/internal/index/indexmap.go b/internal/index/indexmap.go index 811d20903..121ed09af 100644 --- a/internal/index/indexmap.go +++ b/internal/index/indexmap.go @@ -1,7 +1,6 @@ package index import ( - "fmt" "hash/maphash" "github.com/restic/restic/internal/restic" @@ -197,7 +196,7 @@ func (h *hashedArrayTree) index(pos uint) (idx uint, subIdx uint) { func (h *hashedArrayTree) Ref(pos uint) *indexEntry { if pos >= h.size { - panic(fmt.Sprintf("array index %d out of bounds %d", pos, h.size)) + panic("array index out of bounds") } idx, subIdx := h.index(pos) From f1c388c623fee735871067b912e6f5632f33f772 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Mon, 29 May 2023 00:13:32 +0200 Subject: [PATCH 06/11] 
index: remove redundant storage of indexmap size --- internal/index/indexmap.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/internal/index/indexmap.go b/internal/index/indexmap.go index 121ed09af..dfb4a1422 100644 --- a/internal/index/indexmap.go +++ b/internal/index/indexmap.go @@ -2,6 +2,7 @@ package index import ( "hash/maphash" + "math" "github.com/restic/restic/internal/restic" ) @@ -17,8 +18,7 @@ import ( // needs to be resized when the table grows, preventing memory usage spikes. type indexMap struct { // The number of buckets is always a power of two and never zero. - buckets []uint - numentries uint + buckets []uint mh maphash.Hash @@ -34,9 +34,9 @@ const ( // using id as the key. func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32, uncompressedLength uint32) { switch { - case m.numentries == 0: // Lazy initialization. + case m.len() == math.MaxUint: // Lazy initialization. m.init() - case m.numentries >= maxLoad*uint(len(m.buckets)): + case m.len() >= maxLoad*uint(len(m.buckets)): m.grow() } @@ -50,7 +50,6 @@ func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32, uncompr e.uncompressedLength = uncompressedLength m.buckets[h] = idx - m.numentries++ } // foreach calls fn for all entries in the map, until fn returns false. @@ -138,7 +137,7 @@ func (m *indexMap) init() { m.newEntry() } -func (m *indexMap) len() uint { return m.numentries } +func (m *indexMap) len() uint { return m.blockList.Size() - 1 } func (m *indexMap) newEntry() (*indexEntry, uint) { return m.blockList.Alloc() From fc05e35a08c689df0d03fc7d4bcb5bb13b0548b1 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Tue, 30 May 2023 20:12:36 +0200 Subject: [PATCH 07/11] index: let indexmap.Each iterate in allocation order Iterating through the indexmap according to the bucket order has the problem that all indexEntries are accessed in random order which is rather cache inefficient. 
As we already keep a list of all allocated blocks, just iterate through it. This allows iterating through a batch of indexEntries without random memory accesses. In addition, the packID will likely remain similar across multiple blobs as all blobs of a pack file are added as a single batch. --- internal/index/indexmap.go | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/internal/index/indexmap.go b/internal/index/indexmap.go index dfb4a1422..15f253d76 100644 --- a/internal/index/indexmap.go +++ b/internal/index/indexmap.go @@ -54,13 +54,10 @@ func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32, uncompr // foreach calls fn for all entries in the map, until fn returns false. func (m *indexMap) foreach(fn func(*indexEntry) bool) { - for _, ei := range m.buckets { - for ei != 0 { - e := m.resolve(ei) - if !fn(e) { - return - } - ei = e.next + blockCount := m.blockList.Size() + for i := uint(1); i < blockCount; i++ { + if !fn(m.resolve(i)) { + return } } } From 9a7056a4790ea27c109d6594cbf38b1e34ad4873 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Tue, 30 May 2023 20:13:33 +0200 Subject: [PATCH 08/11] index: implement indexmap.grow() without random access --- internal/index/indexmap.go | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/internal/index/indexmap.go b/internal/index/indexmap.go index 15f253d76..c709f7b3a 100644 --- a/internal/index/indexmap.go +++ b/internal/index/indexmap.go @@ -99,18 +99,15 @@ func (m *indexMap) get(id restic.ID) *indexEntry { } func (m *indexMap) grow() { - old := m.buckets m.buckets = make([]uint, growthFactor*len(m.buckets)) - for _, ei := range old { - for ei != 0 { - e := m.resolve(ei) - h := m.hash(e.id) - next := e.next - e.next = m.buckets[h] - m.buckets[h] = ei - ei = next - } + blockCount := m.blockList.Size() + for i := uint(1); i < blockCount; i++ { + e := m.resolve(i) + + h := m.hash(e.id) + e.next = m.buckets[h] + m.buckets[h] = i } } 
From ac1dfc99bb01205dab5d99abcea8c58e3439975e Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Fri, 2 Jun 2023 19:39:12 +0200 Subject: [PATCH 09/11] index: fix blocklist size --- internal/index/indexmap.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/index/indexmap.go b/internal/index/indexmap.go index c709f7b3a..b779b0527 100644 --- a/internal/index/indexmap.go +++ b/internal/index/indexmap.go @@ -204,14 +204,14 @@ func (h *hashedArrayTree) grow() { idx, subIdx := h.index(h.size) if int(idx) == len(h.blockList) { // blockList is too small -> double list and block size - oldBlocks := h.blockList - h.blockList = make([][]indexEntry, h.blockSize) - h.blockSize *= 2 h.mask = h.mask*2 + 1 h.maskShift++ idx = idx / 2 + oldBlocks := h.blockList + h.blockList = make([][]indexEntry, h.blockSize) + // pairwise merging of blocks for i := 0; i < len(oldBlocks); i += 2 { block := make([]indexEntry, 0, h.blockSize) From 55c21846b1d7a6d62126340bfc2aaef5f10981ca Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Thu, 8 Jun 2023 18:07:06 +0200 Subject: [PATCH 10/11] Revert "index: remove redundant storage of indexmap size" This reverts commit f1c388c623fee735871067b912e6f5632f33f772. For an uninitialized indexmap the returned size was `-1` which is unexpected and could cause problems. --- internal/index/indexmap.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/internal/index/indexmap.go b/internal/index/indexmap.go index b779b0527..2386e01b6 100644 --- a/internal/index/indexmap.go +++ b/internal/index/indexmap.go @@ -2,7 +2,6 @@ package index import ( "hash/maphash" - "math" "github.com/restic/restic/internal/restic" ) @@ -18,7 +17,8 @@ import ( // needs to be resized when the table grows, preventing memory usage spikes. type indexMap struct { // The number of buckets is always a power of two and never zero. 
- buckets []uint + buckets []uint + numentries uint mh maphash.Hash @@ -34,9 +34,9 @@ const ( // using id as the key. func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32, uncompressedLength uint32) { switch { - case m.len() == math.MaxUint: // Lazy initialization. + case m.numentries == 0: // Lazy initialization. m.init() - case m.len() >= maxLoad*uint(len(m.buckets)): + case m.numentries >= maxLoad*uint(len(m.buckets)): m.grow() } @@ -50,6 +50,7 @@ func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32, uncompr e.uncompressedLength = uncompressedLength m.buckets[h] = idx + m.numentries++ } // foreach calls fn for all entries in the map, until fn returns false. @@ -131,7 +132,7 @@ func (m *indexMap) init() { m.newEntry() } -func (m *indexMap) len() uint { return m.blockList.Size() - 1 } +func (m *indexMap) len() uint { return m.numentries } func (m *indexMap) newEntry() (*indexEntry, uint) { return m.blockList.Alloc() From b2ed42cec45f06b17b86fbab745d85110f2ed3b1 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Fri, 16 Jun 2023 23:12:30 +0200 Subject: [PATCH 11/11] index: add basic hat test --- internal/index/indexmap_test.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/internal/index/indexmap_test.go b/internal/index/indexmap_test.go index 391131ca0..a16670c7d 100644 --- a/internal/index/indexmap_test.go +++ b/internal/index/indexmap_test.go @@ -108,6 +108,21 @@ func TestIndexMapForeachWithID(t *testing.T) { } } +func TestHashedArrayTree(t *testing.T) { + hat := newHAT() + const testSize = 1024 + for i := uint(0); i < testSize; i++ { + rtest.Assert(t, hat.Size() == i, "expected hat size %v got %v", i, hat.Size()) + e, idx := hat.Alloc() + rtest.Assert(t, idx == i, "expected entry at idx %v got %v", i, idx) + e.length = uint32(i) + } + for i := uint(0); i < testSize; i++ { + e := hat.Ref(i) + rtest.Assert(t, e.length == uint32(i), "expected entry to contain %v got %v", uint32(i), e.length) + } +} + func 
BenchmarkIndexMapHash(b *testing.B) { var m indexMap m.add(restic.ID{}, 0, 0, 0, 0) // Trigger lazy initialization.