forked from TrueCloudLab/restic
index: store indexEntries in hashed array tree
This data structure reduces the wasted memory to O(sqrt(n)). The top-layer of the hashed array tree (HAT) also has a size of O(sqrt(n)), which makes it cache efficient. The top-layer should be small enough to easily fit into the CPU cache and thus only adds little overhead compared to directly accessing an index entry via a pointer.
This commit is contained in:
parent
b217f38ee7
commit
fed33295c3
1 changed files with 85 additions and 9 deletions
|
@ -1,6 +1,7 @@
|
|||
package index
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"hash/maphash"
|
||||
|
||||
"github.com/restic/restic/internal/restic"
|
||||
|
@ -22,7 +23,7 @@ type indexMap struct {
|
|||
|
||||
mh maphash.Hash
|
||||
|
||||
blockList []indexEntry
|
||||
blockList hashedArrayTree
|
||||
}
|
||||
|
||||
const (
|
||||
|
@ -134,22 +135,18 @@ func (m *indexMap) init() {
|
|||
const initialBuckets = 64
|
||||
m.buckets = make([]uint, initialBuckets)
|
||||
// first entry in blockList serves as null byte
|
||||
m.blockList = make([]indexEntry, 1)
|
||||
m.blockList = *newHAT()
|
||||
m.newEntry()
|
||||
}
|
||||
|
||||
func (m *indexMap) len() uint { return m.numentries }
|
||||
|
||||
func (m *indexMap) newEntry() (*indexEntry, uint) {
|
||||
m.blockList = append(m.blockList, indexEntry{})
|
||||
|
||||
idx := uint(len(m.blockList) - 1)
|
||||
e := &m.blockList[idx]
|
||||
|
||||
return e, idx
|
||||
return m.blockList.Alloc()
|
||||
}
|
||||
|
||||
func (m *indexMap) resolve(idx uint) *indexEntry {
|
||||
return &m.blockList[idx]
|
||||
return m.blockList.Ref(idx)
|
||||
}
|
||||
|
||||
type indexEntry struct {
|
||||
|
@ -160,3 +157,82 @@ type indexEntry struct {
|
|||
length uint32
|
||||
uncompressedLength uint32
|
||||
}
|
||||
|
||||
type hashedArrayTree struct {
|
||||
mask uint
|
||||
maskShift uint
|
||||
blockSize uint
|
||||
|
||||
size uint
|
||||
blockList [][]indexEntry
|
||||
}
|
||||
|
||||
func newHAT() *hashedArrayTree {
|
||||
// start with a small block size
|
||||
blockSizePower := uint(2)
|
||||
blockSize := uint(1 << blockSizePower)
|
||||
|
||||
return &hashedArrayTree{
|
||||
mask: blockSize - 1,
|
||||
maskShift: blockSizePower,
|
||||
blockSize: blockSize,
|
||||
size: 0,
|
||||
blockList: make([][]indexEntry, blockSize),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *hashedArrayTree) Alloc() (*indexEntry, uint) {
|
||||
h.grow()
|
||||
size := h.size
|
||||
idx, subIdx := h.index(size)
|
||||
h.size++
|
||||
return &h.blockList[idx][subIdx], size
|
||||
}
|
||||
|
||||
func (h *hashedArrayTree) index(pos uint) (idx uint, subIdx uint) {
|
||||
subIdx = pos & h.mask
|
||||
idx = pos >> h.maskShift
|
||||
return
|
||||
}
|
||||
|
||||
func (h *hashedArrayTree) Ref(pos uint) *indexEntry {
|
||||
if pos >= h.size {
|
||||
panic(fmt.Sprintf("array index %d out of bounds %d", pos, h.size))
|
||||
}
|
||||
|
||||
idx, subIdx := h.index(pos)
|
||||
return &h.blockList[idx][subIdx]
|
||||
}
|
||||
|
||||
func (h *hashedArrayTree) Size() uint {
|
||||
return h.size
|
||||
}
|
||||
|
||||
func (h *hashedArrayTree) grow() {
|
||||
idx, subIdx := h.index(h.size)
|
||||
if int(idx) == len(h.blockList) {
|
||||
// blockList is too small -> double list and block size
|
||||
oldBlocks := h.blockList
|
||||
h.blockList = make([][]indexEntry, h.blockSize)
|
||||
|
||||
h.blockSize *= 2
|
||||
h.mask = h.mask*2 + 1
|
||||
h.maskShift++
|
||||
idx = idx / 2
|
||||
|
||||
// pairwise merging of blocks
|
||||
for i := 0; i < len(oldBlocks); i += 2 {
|
||||
block := make([]indexEntry, 0, h.blockSize)
|
||||
block = append(block, oldBlocks[i]...)
|
||||
block = append(block, oldBlocks[i+1]...)
|
||||
h.blockList[i/2] = block
|
||||
// allow GC
|
||||
oldBlocks[i] = nil
|
||||
oldBlocks[i+1] = nil
|
||||
}
|
||||
}
|
||||
if subIdx == 0 {
|
||||
// new index entry batch
|
||||
h.blockList[idx] = make([]indexEntry, h.blockSize)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue