2022-06-12 12:43:43 +00:00
|
|
|
package index
|
2020-06-23 20:13:25 +00:00
|
|
|
|
|
|
|
import (
|
Replace siphash by hash/maphash
In Go 1.17.1, maphash has become quite a bit faster than siphash, so we
can drop one third-party dependency. maphash is just an interface to the
standard Go map's hash function, which we already trust for other use
cases.
Benchmark results on linux/amd64, -benchtime=3s:
name old time/op new time/op delta
IndexHasUnknown-8 50.6ns ±10% 41.0ns ±19% -18.92% (p=0.000 n=9+10)
IndexHasKnown-8 52.6ns ±12% 41.5ns ±12% -21.13% (p=0.000 n=9+10)
IndexMapHash-8 3.64µs ± 1% 2.00µs ± 0% -45.09% (p=0.000 n=10+9)
IndexAlloc-8 700ms ± 1% 601ms ± 6% -14.18% (p=0.000 n=8+10)
IndexAllocParallel-8 205ms ± 5% 192ms ± 8% -6.18% (p=0.043 n=10+10)
MasterIndexAlloc-8 319ms ± 1% 279ms ± 5% -12.58% (p=0.000 n=10+10)
MasterIndexLookupSingleIndex-8 156ns ± 8% 147ns ± 6% -5.46% (p=0.023 n=10+10)
MasterIndexLookupMultipleIndex-8 150ns ± 7% 142ns ± 8% -5.69% (p=0.007 n=10+10)
MasterIndexLookupSingleIndexUnknown-8 74.4ns ± 6% 72.0ns ± 9% ~ (p=0.175 n=10+9)
MasterIndexLookupMultipleIndexUnknown-8 67.4ns ± 9% 65.5ns ± 7% ~ (p=0.340 n=9+9)
MasterIndexLookupParallel/known,indices=25-8 461ns ± 2% 445ns ± 2% -3.49% (p=0.000 n=10+10)
MasterIndexLookupParallel/unknown,indices=25-8 408ns ±11% 378ns ± 5% -7.22% (p=0.035 n=10+9)
MasterIndexLookupParallel/known,indices=50-8 479ns ± 1% 437ns ± 4% -8.82% (p=0.000 n=10+10)
MasterIndexLookupParallel/unknown,indices=50-8 406ns ± 8% 343ns ±15% -15.44% (p=0.001 n=10+10)
MasterIndexLookupParallel/known,indices=100-8 480ns ± 1% 455ns ± 5% -5.15% (p=0.000 n=8+10)
MasterIndexLookupParallel/unknown,indices=100-8 391ns ±18% 382ns ± 8% ~ (p=0.315 n=10+10)
MasterIndexLookupBlobSize-8 71.0ns ± 8% 57.2ns ±11% -19.36% (p=0.000 n=9+10)
PackerManager-8 254ms ± 1% 254ms ± 1% ~ (p=0.285 n=15+15)
name old speed new speed delta
IndexMapHash-8 1.12GB/s ± 1% 2.05GB/s ± 0% +82.13% (p=0.000 n=10+9)
PackerManager-8 208MB/s ± 1% 207MB/s ± 1% ~ (p=0.281 n=15+15)
name old alloc/op new alloc/op delta
IndexMapHash-8 0.00B 0.00B ~ (all equal)
IndexAlloc-8 400MB ± 0% 400MB ± 0% ~ (p=1.000 n=9+10)
IndexAllocParallel-8 401MB ± 0% 401MB ± 0% +0.00% (p=0.000 n=10+10)
MasterIndexAlloc-8 258MB ± 0% 262MB ± 0% +1.42% (p=0.000 n=9+10)
PackerManager-8 73.1kB ± 0% 73.1kB ± 0% ~ (p=0.382 n=13+13)
name old allocs/op new allocs/op delta
IndexMapHash-8 0.00 0.00 ~ (all equal)
IndexAlloc-8 907k ± 0% 907k ± 0% -0.00% (p=0.000 n=10+10)
IndexAllocParallel-8 907k ± 0% 907k ± 0% +0.00% (p=0.009 n=10+10)
MasterIndexAlloc-8 327k ± 0% 317k ± 0% -3.06% (p=0.000 n=10+10)
PackerManager-8 744 ± 0% 744 ± 0% ~ (all equal)
2021-09-17 10:38:17 +00:00
|
|
|
"hash/maphash"
|
2020-06-23 20:13:25 +00:00
|
|
|
|
|
|
|
"github.com/restic/restic/internal/restic"
|
|
|
|
)
|
|
|
|
|
|
|
|
// An indexMap is a chained hash table that maps blob IDs to indexEntries.
|
|
|
|
// It allows storing multiple entries with the same key.
|
|
|
|
//
|
|
|
|
// IndexMap uses some optimizations that are not compatible with supporting
|
|
|
|
// deletions.
|
|
|
|
//
|
|
|
|
// The buckets in this hash table contain only pointers, rather than inlined
|
|
|
|
// key-value pairs like the standard Go map. This way, only a pointer array
|
|
|
|
// needs to be resized when the table grows, preventing memory usage spikes.
|
|
|
|
type indexMap struct {
|
|
|
|
// The number of buckets is always a power of two and never zero.
|
|
|
|
buckets []*indexEntry
|
|
|
|
numentries uint
|
|
|
|
|
Replace siphash by hash/maphash
In Go 1.17.1, maphash has become quite a bit faster than siphash, so we
can drop one third-party dependency. maphash is just an interface to the
standard Go map's hash function, which we already trust for other use
cases.
Benchmark results on linux/amd64, -benchtime=3s:
name old time/op new time/op delta
IndexHasUnknown-8 50.6ns ±10% 41.0ns ±19% -18.92% (p=0.000 n=9+10)
IndexHasKnown-8 52.6ns ±12% 41.5ns ±12% -21.13% (p=0.000 n=9+10)
IndexMapHash-8 3.64µs ± 1% 2.00µs ± 0% -45.09% (p=0.000 n=10+9)
IndexAlloc-8 700ms ± 1% 601ms ± 6% -14.18% (p=0.000 n=8+10)
IndexAllocParallel-8 205ms ± 5% 192ms ± 8% -6.18% (p=0.043 n=10+10)
MasterIndexAlloc-8 319ms ± 1% 279ms ± 5% -12.58% (p=0.000 n=10+10)
MasterIndexLookupSingleIndex-8 156ns ± 8% 147ns ± 6% -5.46% (p=0.023 n=10+10)
MasterIndexLookupMultipleIndex-8 150ns ± 7% 142ns ± 8% -5.69% (p=0.007 n=10+10)
MasterIndexLookupSingleIndexUnknown-8 74.4ns ± 6% 72.0ns ± 9% ~ (p=0.175 n=10+9)
MasterIndexLookupMultipleIndexUnknown-8 67.4ns ± 9% 65.5ns ± 7% ~ (p=0.340 n=9+9)
MasterIndexLookupParallel/known,indices=25-8 461ns ± 2% 445ns ± 2% -3.49% (p=0.000 n=10+10)
MasterIndexLookupParallel/unknown,indices=25-8 408ns ±11% 378ns ± 5% -7.22% (p=0.035 n=10+9)
MasterIndexLookupParallel/known,indices=50-8 479ns ± 1% 437ns ± 4% -8.82% (p=0.000 n=10+10)
MasterIndexLookupParallel/unknown,indices=50-8 406ns ± 8% 343ns ±15% -15.44% (p=0.001 n=10+10)
MasterIndexLookupParallel/known,indices=100-8 480ns ± 1% 455ns ± 5% -5.15% (p=0.000 n=8+10)
MasterIndexLookupParallel/unknown,indices=100-8 391ns ±18% 382ns ± 8% ~ (p=0.315 n=10+10)
MasterIndexLookupBlobSize-8 71.0ns ± 8% 57.2ns ±11% -19.36% (p=0.000 n=9+10)
PackerManager-8 254ms ± 1% 254ms ± 1% ~ (p=0.285 n=15+15)
name old speed new speed delta
IndexMapHash-8 1.12GB/s ± 1% 2.05GB/s ± 0% +82.13% (p=0.000 n=10+9)
PackerManager-8 208MB/s ± 1% 207MB/s ± 1% ~ (p=0.281 n=15+15)
name old alloc/op new alloc/op delta
IndexMapHash-8 0.00B 0.00B ~ (all equal)
IndexAlloc-8 400MB ± 0% 400MB ± 0% ~ (p=1.000 n=9+10)
IndexAllocParallel-8 401MB ± 0% 401MB ± 0% +0.00% (p=0.000 n=10+10)
MasterIndexAlloc-8 258MB ± 0% 262MB ± 0% +1.42% (p=0.000 n=9+10)
PackerManager-8 73.1kB ± 0% 73.1kB ± 0% ~ (p=0.382 n=13+13)
name old allocs/op new allocs/op delta
IndexMapHash-8 0.00 0.00 ~ (all equal)
IndexAlloc-8 907k ± 0% 907k ± 0% -0.00% (p=0.000 n=10+10)
IndexAllocParallel-8 907k ± 0% 907k ± 0% +0.00% (p=0.009 n=10+10)
MasterIndexAlloc-8 327k ± 0% 317k ± 0% -3.06% (p=0.000 n=10+10)
PackerManager-8 744 ± 0% 744 ± 0% ~ (all equal)
2021-09-17 10:38:17 +00:00
|
|
|
mh maphash.Hash
|
2020-06-23 20:13:25 +00:00
|
|
|
|
|
|
|
free *indexEntry // Free list.
|
|
|
|
}
|
|
|
|
|
|
|
|
const (
|
|
|
|
growthFactor = 2 // Must be a power of 2.
|
|
|
|
maxLoad = 4 // Max. number of entries per bucket.
|
|
|
|
)
|
|
|
|
|
|
|
|
// add inserts an indexEntry for the given arguments into the map,
|
|
|
|
// using id as the key.
|
2022-02-13 16:24:09 +00:00
|
|
|
func (m *indexMap) add(id restic.ID, packIdx int, offset, length uint32, uncompressedLength uint32) {
|
2020-06-23 20:13:25 +00:00
|
|
|
switch {
|
|
|
|
case m.numentries == 0: // Lazy initialization.
|
|
|
|
m.init()
|
|
|
|
case m.numentries >= maxLoad*uint(len(m.buckets)):
|
|
|
|
m.grow()
|
|
|
|
}
|
|
|
|
|
|
|
|
h := m.hash(id)
|
|
|
|
e := m.newEntry()
|
|
|
|
e.id = id
|
|
|
|
e.next = m.buckets[h] // Prepend to existing chain.
|
|
|
|
e.packIndex = packIdx
|
|
|
|
e.offset = offset
|
|
|
|
e.length = length
|
2022-02-13 16:24:09 +00:00
|
|
|
e.uncompressedLength = uncompressedLength
|
2020-06-23 20:13:25 +00:00
|
|
|
|
|
|
|
m.buckets[h] = e
|
|
|
|
m.numentries++
|
|
|
|
}
|
|
|
|
|
|
|
|
// foreach calls fn for all entries in the map, until fn returns false.
|
|
|
|
func (m *indexMap) foreach(fn func(*indexEntry) bool) {
|
|
|
|
for _, e := range m.buckets {
|
|
|
|
for e != nil {
|
|
|
|
if !fn(e) {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
e = e.next
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// foreachWithID calls fn for all entries with the given id.
|
|
|
|
func (m *indexMap) foreachWithID(id restic.ID, fn func(*indexEntry)) {
|
|
|
|
if len(m.buckets) == 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
h := m.hash(id)
|
|
|
|
for e := m.buckets[h]; e != nil; e = e.next {
|
|
|
|
if e.id != id {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
fn(e)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// get returns the first entry for the given id.
|
|
|
|
func (m *indexMap) get(id restic.ID) *indexEntry {
|
|
|
|
if len(m.buckets) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
h := m.hash(id)
|
|
|
|
for e := m.buckets[h]; e != nil; e = e.next {
|
|
|
|
if e.id == id {
|
|
|
|
return e
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (m *indexMap) grow() {
|
|
|
|
old := m.buckets
|
|
|
|
m.buckets = make([]*indexEntry, growthFactor*len(m.buckets))
|
|
|
|
|
|
|
|
for _, e := range old {
|
|
|
|
for e != nil {
|
|
|
|
h := m.hash(e.id)
|
|
|
|
next := e.next
|
|
|
|
e.next = m.buckets[h]
|
|
|
|
m.buckets[h] = e
|
|
|
|
e = next
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (m *indexMap) hash(id restic.ID) uint {
|
Replace siphash by hash/maphash
In Go 1.17.1, maphash has become quite a bit faster than siphash, so we
can drop one third-party dependency. maphash is just an interface to the
standard Go map's hash function, which we already trust for other use
cases.
Benchmark results on linux/amd64, -benchtime=3s:
name old time/op new time/op delta
IndexHasUnknown-8 50.6ns ±10% 41.0ns ±19% -18.92% (p=0.000 n=9+10)
IndexHasKnown-8 52.6ns ±12% 41.5ns ±12% -21.13% (p=0.000 n=9+10)
IndexMapHash-8 3.64µs ± 1% 2.00µs ± 0% -45.09% (p=0.000 n=10+9)
IndexAlloc-8 700ms ± 1% 601ms ± 6% -14.18% (p=0.000 n=8+10)
IndexAllocParallel-8 205ms ± 5% 192ms ± 8% -6.18% (p=0.043 n=10+10)
MasterIndexAlloc-8 319ms ± 1% 279ms ± 5% -12.58% (p=0.000 n=10+10)
MasterIndexLookupSingleIndex-8 156ns ± 8% 147ns ± 6% -5.46% (p=0.023 n=10+10)
MasterIndexLookupMultipleIndex-8 150ns ± 7% 142ns ± 8% -5.69% (p=0.007 n=10+10)
MasterIndexLookupSingleIndexUnknown-8 74.4ns ± 6% 72.0ns ± 9% ~ (p=0.175 n=10+9)
MasterIndexLookupMultipleIndexUnknown-8 67.4ns ± 9% 65.5ns ± 7% ~ (p=0.340 n=9+9)
MasterIndexLookupParallel/known,indices=25-8 461ns ± 2% 445ns ± 2% -3.49% (p=0.000 n=10+10)
MasterIndexLookupParallel/unknown,indices=25-8 408ns ±11% 378ns ± 5% -7.22% (p=0.035 n=10+9)
MasterIndexLookupParallel/known,indices=50-8 479ns ± 1% 437ns ± 4% -8.82% (p=0.000 n=10+10)
MasterIndexLookupParallel/unknown,indices=50-8 406ns ± 8% 343ns ±15% -15.44% (p=0.001 n=10+10)
MasterIndexLookupParallel/known,indices=100-8 480ns ± 1% 455ns ± 5% -5.15% (p=0.000 n=8+10)
MasterIndexLookupParallel/unknown,indices=100-8 391ns ±18% 382ns ± 8% ~ (p=0.315 n=10+10)
MasterIndexLookupBlobSize-8 71.0ns ± 8% 57.2ns ±11% -19.36% (p=0.000 n=9+10)
PackerManager-8 254ms ± 1% 254ms ± 1% ~ (p=0.285 n=15+15)
name old speed new speed delta
IndexMapHash-8 1.12GB/s ± 1% 2.05GB/s ± 0% +82.13% (p=0.000 n=10+9)
PackerManager-8 208MB/s ± 1% 207MB/s ± 1% ~ (p=0.281 n=15+15)
name old alloc/op new alloc/op delta
IndexMapHash-8 0.00B 0.00B ~ (all equal)
IndexAlloc-8 400MB ± 0% 400MB ± 0% ~ (p=1.000 n=9+10)
IndexAllocParallel-8 401MB ± 0% 401MB ± 0% +0.00% (p=0.000 n=10+10)
MasterIndexAlloc-8 258MB ± 0% 262MB ± 0% +1.42% (p=0.000 n=9+10)
PackerManager-8 73.1kB ± 0% 73.1kB ± 0% ~ (p=0.382 n=13+13)
name old allocs/op new allocs/op delta
IndexMapHash-8 0.00 0.00 ~ (all equal)
IndexAlloc-8 907k ± 0% 907k ± 0% -0.00% (p=0.000 n=10+10)
IndexAllocParallel-8 907k ± 0% 907k ± 0% +0.00% (p=0.009 n=10+10)
MasterIndexAlloc-8 327k ± 0% 317k ± 0% -3.06% (p=0.000 n=10+10)
PackerManager-8 744 ± 0% 744 ± 0% ~ (all equal)
2021-09-17 10:38:17 +00:00
|
|
|
// We use maphash to prevent backups of specially crafted inputs
|
|
|
|
// from degrading performance.
|
2020-06-23 20:13:25 +00:00
|
|
|
// While SHA-256 should be collision-resistant, for hash table indices
|
|
|
|
// we use only a few bits of it and finding collisions for those is
|
|
|
|
// much easier than breaking the whole algorithm.
|
Replace siphash by hash/maphash
In Go 1.17.1, maphash has become quite a bit faster than siphash, so we
can drop one third-party dependency. maphash is just an interface to the
standard Go map's hash function, which we already trust for other use
cases.
Benchmark results on linux/amd64, -benchtime=3s:
name old time/op new time/op delta
IndexHasUnknown-8 50.6ns ±10% 41.0ns ±19% -18.92% (p=0.000 n=9+10)
IndexHasKnown-8 52.6ns ±12% 41.5ns ±12% -21.13% (p=0.000 n=9+10)
IndexMapHash-8 3.64µs ± 1% 2.00µs ± 0% -45.09% (p=0.000 n=10+9)
IndexAlloc-8 700ms ± 1% 601ms ± 6% -14.18% (p=0.000 n=8+10)
IndexAllocParallel-8 205ms ± 5% 192ms ± 8% -6.18% (p=0.043 n=10+10)
MasterIndexAlloc-8 319ms ± 1% 279ms ± 5% -12.58% (p=0.000 n=10+10)
MasterIndexLookupSingleIndex-8 156ns ± 8% 147ns ± 6% -5.46% (p=0.023 n=10+10)
MasterIndexLookupMultipleIndex-8 150ns ± 7% 142ns ± 8% -5.69% (p=0.007 n=10+10)
MasterIndexLookupSingleIndexUnknown-8 74.4ns ± 6% 72.0ns ± 9% ~ (p=0.175 n=10+9)
MasterIndexLookupMultipleIndexUnknown-8 67.4ns ± 9% 65.5ns ± 7% ~ (p=0.340 n=9+9)
MasterIndexLookupParallel/known,indices=25-8 461ns ± 2% 445ns ± 2% -3.49% (p=0.000 n=10+10)
MasterIndexLookupParallel/unknown,indices=25-8 408ns ±11% 378ns ± 5% -7.22% (p=0.035 n=10+9)
MasterIndexLookupParallel/known,indices=50-8 479ns ± 1% 437ns ± 4% -8.82% (p=0.000 n=10+10)
MasterIndexLookupParallel/unknown,indices=50-8 406ns ± 8% 343ns ±15% -15.44% (p=0.001 n=10+10)
MasterIndexLookupParallel/known,indices=100-8 480ns ± 1% 455ns ± 5% -5.15% (p=0.000 n=8+10)
MasterIndexLookupParallel/unknown,indices=100-8 391ns ±18% 382ns ± 8% ~ (p=0.315 n=10+10)
MasterIndexLookupBlobSize-8 71.0ns ± 8% 57.2ns ±11% -19.36% (p=0.000 n=9+10)
PackerManager-8 254ms ± 1% 254ms ± 1% ~ (p=0.285 n=15+15)
name old speed new speed delta
IndexMapHash-8 1.12GB/s ± 1% 2.05GB/s ± 0% +82.13% (p=0.000 n=10+9)
PackerManager-8 208MB/s ± 1% 207MB/s ± 1% ~ (p=0.281 n=15+15)
name old alloc/op new alloc/op delta
IndexMapHash-8 0.00B 0.00B ~ (all equal)
IndexAlloc-8 400MB ± 0% 400MB ± 0% ~ (p=1.000 n=9+10)
IndexAllocParallel-8 401MB ± 0% 401MB ± 0% +0.00% (p=0.000 n=10+10)
MasterIndexAlloc-8 258MB ± 0% 262MB ± 0% +1.42% (p=0.000 n=9+10)
PackerManager-8 73.1kB ± 0% 73.1kB ± 0% ~ (p=0.382 n=13+13)
name old allocs/op new allocs/op delta
IndexMapHash-8 0.00 0.00 ~ (all equal)
IndexAlloc-8 907k ± 0% 907k ± 0% -0.00% (p=0.000 n=10+10)
IndexAllocParallel-8 907k ± 0% 907k ± 0% +0.00% (p=0.009 n=10+10)
MasterIndexAlloc-8 327k ± 0% 317k ± 0% -3.06% (p=0.000 n=10+10)
PackerManager-8 744 ± 0% 744 ± 0% ~ (all equal)
2021-09-17 10:38:17 +00:00
|
|
|
m.mh.Reset()
|
|
|
|
_, _ = m.mh.Write(id[:])
|
|
|
|
h := uint(m.mh.Sum64())
|
2020-06-23 20:13:25 +00:00
|
|
|
return h & uint(len(m.buckets)-1)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (m *indexMap) init() {
|
|
|
|
const initialBuckets = 64
|
|
|
|
m.buckets = make([]*indexEntry, initialBuckets)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (m *indexMap) len() uint { return m.numentries }
|
|
|
|
|
|
|
|
func (m *indexMap) newEntry() *indexEntry {
|
repository: Re-tune indexmap allocation strategy
fd05037e1ad7350d89ae91fdb9f52f954787bc4a changed the allocation batch
size from 256 to 128 under the assumption that an indexEntry is 60 bytes
on amd64, but it's 64: structs are padded out to a multiple of 8 for
alignment reasons. That means we'd waste no space in malloc even without
the batch allocation, at least on 64-bit machines. While that strategy
cuts the overallocation down dramatically for many small indexes, it also
seems to slow allocation down (Go 1.18, Linux, amd64, -benchtime=2s):
name old time/op new time/op delta
DecodeIndex-8 4.67s ± 5% 4.60s ± 1% ~ (p=0.953 n=10+5)
DecodeIndexParallel-8 4.67s ± 3% 4.60s ± 1% ~ (p=0.953 n=10+5)
IndexHasUnknown-8 37.8ns ± 8% 36.5ns ±14% ~ (p=0.841 n=5+5)
IndexHasKnown-8 38.5ns ±12% 37.7ns ±10% ~ (p=0.968 n=5+5)
IndexAlloc-8 615ms ±18% 607ms ± 1% ~ (p=1.000 n=10+5)
IndexAllocParallel-8 245ms ±11% 285ms ± 6% +16.40% (p=0.001 n=10+5)
MasterIndexAlloc-8 286ms ± 9% 275ms ± 2% ~ (p=1.000 n=10+5)
LoadIndex/v1-8 27.0ms ± 4% 26.8ms ± 1% ~ (p=0.690 n=5+5)
LoadIndex/v2-8 22.4ms ± 1% 22.8ms ± 2% +1.48% (p=0.016 n=5+5)
name old alloc/op new alloc/op delta
IndexAlloc-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.000 n=8+4)
IndexAllocParallel-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.008 n=8+5)
MasterIndexAlloc-8 213MB ± 0% 159MB ± 0% -25.47% (p=0.000 n=10+5)
name old allocs/op new allocs/op delta
IndexAlloc-8 913k ± 0% 2632k ± 0% +188.19% (p=0.008 n=5+5)
IndexAllocParallel-8 913k ± 0% 2632k ± 0% +188.21% (p=0.008 n=5+5)
MasterIndexAlloc-8 318k ± 0% 1172k ± 0% +267.86% (p=0.008 n=5+5)
Instead, this patch sets a batch size of 4, which means no space is
wasted by malloc on 64-bit and very little on 32-bit. It still gets very
close to the savings from not allocating in batches, without requiring
special code for bits.UintSize==64. Benchmark results, again for
Linux/amd64:
name old time/op new time/op delta
DecodeIndex-8 4.67s ± 5% 4.83s ± 9% ~ (p=0.315 n=10+10)
DecodeIndexParallel-8 4.67s ± 3% 4.68s ± 4% ~ (p=0.315 n=10+10)
IndexHasUnknown-8 37.8ns ± 8% 44.5ns ±19% ~ (p=0.095 n=5+5)
IndexHasKnown-8 38.5ns ±12% 36.9ns ± 8% ~ (p=0.690 n=5+5)
IndexAlloc-8 615ms ±18% 628ms ±18% ~ (p=0.218 n=10+10)
IndexAllocParallel-8 245ms ±11% 262ms ± 9% +7.02% (p=0.043 n=10+10)
MasterIndexAlloc-8 286ms ± 9% 287ms ±13% ~ (p=1.000 n=10+10)
LoadIndex/v1-8 27.0ms ± 4% 26.8ms ± 0% ~ (p=1.000 n=5+5)
LoadIndex/v2-8 22.4ms ± 1% 22.5ms ± 0% ~ (p=0.056 n=5+5)
name old alloc/op new alloc/op delta
IndexAlloc-8 446MB ± 0% 446MB ± 0% ~ (p=1.000 n=8+10)
IndexAllocParallel-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.000 n=8+8)
MasterIndexAlloc-8 213MB ± 0% 160MB ± 0% -25.02% (p=0.000 n=10+9)
name old allocs/op new allocs/op delta
IndexAlloc-8 913k ± 0% 1333k ± 0% +45.94% (p=0.000 n=8+10)
IndexAllocParallel-8 913k ± 0% 1333k ± 0% +45.94% (p=0.000 n=8+8)
MasterIndexAlloc-8 318k ± 0% 525k ± 0% +64.99% (p=0.000 n=10+10)
The allocation method indexmap.newEntry has also been rewritten in a
form that is a few instructions shorter.
2022-05-11 18:53:21 +00:00
|
|
|
// We keep a free list of objects to speed up allocation and GC.
|
|
|
|
// There's an obvious trade-off here: allocating in larger batches
|
|
|
|
// means we allocate faster and the GC has to keep fewer bits to track
|
|
|
|
// what we have in use, but it means we waste some space.
|
2020-06-23 20:13:25 +00:00
|
|
|
//
|
repository: Re-tune indexmap allocation strategy
fd05037e1ad7350d89ae91fdb9f52f954787bc4a changed the allocation batch
size from 256 to 128 under the assumption that an indexEntry is 60 bytes
on amd64, but it's 64: structs are padded out to a multiple of 8 for
alignment reasons. That means we'd waste no space in malloc even without
the batch allocation, at least on 64-bit machines. While that strategy
cuts the overallocation down dramatically for many small indexes, it also
seems to slow allocation down (Go 1.18, Linux, amd64, -benchtime=2s):
name old time/op new time/op delta
DecodeIndex-8 4.67s ± 5% 4.60s ± 1% ~ (p=0.953 n=10+5)
DecodeIndexParallel-8 4.67s ± 3% 4.60s ± 1% ~ (p=0.953 n=10+5)
IndexHasUnknown-8 37.8ns ± 8% 36.5ns ±14% ~ (p=0.841 n=5+5)
IndexHasKnown-8 38.5ns ±12% 37.7ns ±10% ~ (p=0.968 n=5+5)
IndexAlloc-8 615ms ±18% 607ms ± 1% ~ (p=1.000 n=10+5)
IndexAllocParallel-8 245ms ±11% 285ms ± 6% +16.40% (p=0.001 n=10+5)
MasterIndexAlloc-8 286ms ± 9% 275ms ± 2% ~ (p=1.000 n=10+5)
LoadIndex/v1-8 27.0ms ± 4% 26.8ms ± 1% ~ (p=0.690 n=5+5)
LoadIndex/v2-8 22.4ms ± 1% 22.8ms ± 2% +1.48% (p=0.016 n=5+5)
name old alloc/op new alloc/op delta
IndexAlloc-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.000 n=8+4)
IndexAllocParallel-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.008 n=8+5)
MasterIndexAlloc-8 213MB ± 0% 159MB ± 0% -25.47% (p=0.000 n=10+5)
name old allocs/op new allocs/op delta
IndexAlloc-8 913k ± 0% 2632k ± 0% +188.19% (p=0.008 n=5+5)
IndexAllocParallel-8 913k ± 0% 2632k ± 0% +188.21% (p=0.008 n=5+5)
MasterIndexAlloc-8 318k ± 0% 1172k ± 0% +267.86% (p=0.008 n=5+5)
Instead, this patch sets a batch size of 4, which means no space is
wasted by malloc on 64-bit and very little on 32-bit. It still gets very
close to the savings from not allocating in batches, without requiring
special code for bits.UintSize==64. Benchmark results, again for
Linux/amd64:
name old time/op new time/op delta
DecodeIndex-8 4.67s ± 5% 4.83s ± 9% ~ (p=0.315 n=10+10)
DecodeIndexParallel-8 4.67s ± 3% 4.68s ± 4% ~ (p=0.315 n=10+10)
IndexHasUnknown-8 37.8ns ± 8% 44.5ns ±19% ~ (p=0.095 n=5+5)
IndexHasKnown-8 38.5ns ±12% 36.9ns ± 8% ~ (p=0.690 n=5+5)
IndexAlloc-8 615ms ±18% 628ms ±18% ~ (p=0.218 n=10+10)
IndexAllocParallel-8 245ms ±11% 262ms ± 9% +7.02% (p=0.043 n=10+10)
MasterIndexAlloc-8 286ms ± 9% 287ms ±13% ~ (p=1.000 n=10+10)
LoadIndex/v1-8 27.0ms ± 4% 26.8ms ± 0% ~ (p=1.000 n=5+5)
LoadIndex/v2-8 22.4ms ± 1% 22.5ms ± 0% ~ (p=0.056 n=5+5)
name old alloc/op new alloc/op delta
IndexAlloc-8 446MB ± 0% 446MB ± 0% ~ (p=1.000 n=8+10)
IndexAllocParallel-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.000 n=8+8)
MasterIndexAlloc-8 213MB ± 0% 160MB ± 0% -25.02% (p=0.000 n=10+9)
name old allocs/op new allocs/op delta
IndexAlloc-8 913k ± 0% 1333k ± 0% +45.94% (p=0.000 n=8+10)
IndexAllocParallel-8 913k ± 0% 1333k ± 0% +45.94% (p=0.000 n=8+8)
MasterIndexAlloc-8 318k ± 0% 525k ± 0% +64.99% (p=0.000 n=10+10)
The allocation method indexmap.newEntry has also been rewritten in a
form that is a few instructions shorter.
2022-05-11 18:53:21 +00:00
|
|
|
// Then again, allocating each indexEntry separately also wastes space
|
|
|
|
// on 32-bit platforms, because the Go malloc has no size class for
|
|
|
|
// exactly 52 bytes, so it puts the indexEntry in a 64-byte slot instead.
|
|
|
|
// See src/runtime/sizeclasses.go in the Go source repo.
|
|
|
|
//
|
|
|
|
// The batch size of 4 means we hit the size classes for 4×64=256 bytes
|
|
|
|
// (64-bit) and 4×52=208 bytes (32-bit), wasting nothing in malloc on
|
|
|
|
// 64-bit and relatively little on 32-bit.
|
|
|
|
const entryAllocBatch = 4
|
2020-06-23 20:13:25 +00:00
|
|
|
|
repository: Re-tune indexmap allocation strategy
fd05037e1ad7350d89ae91fdb9f52f954787bc4a changed the allocation batch
size from 256 to 128 under the assumption that an indexEntry is 60 bytes
on amd64, but it's 64: structs are padded out to a multiple of 8 for
alignment reasons. That means we'd waste no space in malloc even without
the batch allocation, at least on 64-bit machines. While that strategy
cuts the overallocation down dramatically for many small indexes, it also
seems to slow allocation down (Go 1.18, Linux, amd64, -benchtime=2s):
name old time/op new time/op delta
DecodeIndex-8 4.67s ± 5% 4.60s ± 1% ~ (p=0.953 n=10+5)
DecodeIndexParallel-8 4.67s ± 3% 4.60s ± 1% ~ (p=0.953 n=10+5)
IndexHasUnknown-8 37.8ns ± 8% 36.5ns ±14% ~ (p=0.841 n=5+5)
IndexHasKnown-8 38.5ns ±12% 37.7ns ±10% ~ (p=0.968 n=5+5)
IndexAlloc-8 615ms ±18% 607ms ± 1% ~ (p=1.000 n=10+5)
IndexAllocParallel-8 245ms ±11% 285ms ± 6% +16.40% (p=0.001 n=10+5)
MasterIndexAlloc-8 286ms ± 9% 275ms ± 2% ~ (p=1.000 n=10+5)
LoadIndex/v1-8 27.0ms ± 4% 26.8ms ± 1% ~ (p=0.690 n=5+5)
LoadIndex/v2-8 22.4ms ± 1% 22.8ms ± 2% +1.48% (p=0.016 n=5+5)
name old alloc/op new alloc/op delta
IndexAlloc-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.000 n=8+4)
IndexAllocParallel-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.008 n=8+5)
MasterIndexAlloc-8 213MB ± 0% 159MB ± 0% -25.47% (p=0.000 n=10+5)
name old allocs/op new allocs/op delta
IndexAlloc-8 913k ± 0% 2632k ± 0% +188.19% (p=0.008 n=5+5)
IndexAllocParallel-8 913k ± 0% 2632k ± 0% +188.21% (p=0.008 n=5+5)
MasterIndexAlloc-8 318k ± 0% 1172k ± 0% +267.86% (p=0.008 n=5+5)
Instead, this patch sets a batch size of 4, which means no space is
wasted by malloc on 64-bit and very little on 32-bit. It still gets very
close to the savings from not allocating in batches, without requiring
special code for bits.UintSize==64. Benchmark results, again for
Linux/amd64:
name old time/op new time/op delta
DecodeIndex-8 4.67s ± 5% 4.83s ± 9% ~ (p=0.315 n=10+10)
DecodeIndexParallel-8 4.67s ± 3% 4.68s ± 4% ~ (p=0.315 n=10+10)
IndexHasUnknown-8 37.8ns ± 8% 44.5ns ±19% ~ (p=0.095 n=5+5)
IndexHasKnown-8 38.5ns ±12% 36.9ns ± 8% ~ (p=0.690 n=5+5)
IndexAlloc-8 615ms ±18% 628ms ±18% ~ (p=0.218 n=10+10)
IndexAllocParallel-8 245ms ±11% 262ms ± 9% +7.02% (p=0.043 n=10+10)
MasterIndexAlloc-8 286ms ± 9% 287ms ±13% ~ (p=1.000 n=10+10)
LoadIndex/v1-8 27.0ms ± 4% 26.8ms ± 0% ~ (p=1.000 n=5+5)
LoadIndex/v2-8 22.4ms ± 1% 22.5ms ± 0% ~ (p=0.056 n=5+5)
name old alloc/op new alloc/op delta
IndexAlloc-8 446MB ± 0% 446MB ± 0% ~ (p=1.000 n=8+10)
IndexAllocParallel-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.000 n=8+8)
MasterIndexAlloc-8 213MB ± 0% 160MB ± 0% -25.02% (p=0.000 n=10+9)
name old allocs/op new allocs/op delta
IndexAlloc-8 913k ± 0% 1333k ± 0% +45.94% (p=0.000 n=8+10)
IndexAllocParallel-8 913k ± 0% 1333k ± 0% +45.94% (p=0.000 n=8+8)
MasterIndexAlloc-8 318k ± 0% 525k ± 0% +64.99% (p=0.000 n=10+10)
The allocation method indexmap.newEntry has also been rewritten in a
form that is a few instructions shorter.
2022-05-11 18:53:21 +00:00
|
|
|
e := m.free
|
|
|
|
if e != nil {
|
|
|
|
m.free = e.next
|
|
|
|
} else {
|
2020-06-23 20:13:25 +00:00
|
|
|
free := new([entryAllocBatch]indexEntry)
|
repository: Re-tune indexmap allocation strategy
fd05037e1ad7350d89ae91fdb9f52f954787bc4a changed the allocation batch
size from 256 to 128 under the assumption that an indexEntry is 60 bytes
on amd64, but it's 64: structs are padded out to a multiple of 8 for
alignment reasons. That means we'd waste no space in malloc even without
the batch allocation, at least on 64-bit machines. While that strategy
cuts the overallocation down dramatically for many small indexes, it also
seems to slow allocation down (Go 1.18, Linux, amd64, -benchtime=2s):
name old time/op new time/op delta
DecodeIndex-8 4.67s ± 5% 4.60s ± 1% ~ (p=0.953 n=10+5)
DecodeIndexParallel-8 4.67s ± 3% 4.60s ± 1% ~ (p=0.953 n=10+5)
IndexHasUnknown-8 37.8ns ± 8% 36.5ns ±14% ~ (p=0.841 n=5+5)
IndexHasKnown-8 38.5ns ±12% 37.7ns ±10% ~ (p=0.968 n=5+5)
IndexAlloc-8 615ms ±18% 607ms ± 1% ~ (p=1.000 n=10+5)
IndexAllocParallel-8 245ms ±11% 285ms ± 6% +16.40% (p=0.001 n=10+5)
MasterIndexAlloc-8 286ms ± 9% 275ms ± 2% ~ (p=1.000 n=10+5)
LoadIndex/v1-8 27.0ms ± 4% 26.8ms ± 1% ~ (p=0.690 n=5+5)
LoadIndex/v2-8 22.4ms ± 1% 22.8ms ± 2% +1.48% (p=0.016 n=5+5)
name old alloc/op new alloc/op delta
IndexAlloc-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.000 n=8+4)
IndexAllocParallel-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.008 n=8+5)
MasterIndexAlloc-8 213MB ± 0% 159MB ± 0% -25.47% (p=0.000 n=10+5)
name old allocs/op new allocs/op delta
IndexAlloc-8 913k ± 0% 2632k ± 0% +188.19% (p=0.008 n=5+5)
IndexAllocParallel-8 913k ± 0% 2632k ± 0% +188.21% (p=0.008 n=5+5)
MasterIndexAlloc-8 318k ± 0% 1172k ± 0% +267.86% (p=0.008 n=5+5)
Instead, this patch sets a batch size of 4, which means no space is
wasted by malloc on 64-bit and very little on 32-bit. It still gets very
close to the savings from not allocating in batches, without requiring
special code for bits.UintSize==64. Benchmark results, again for
Linux/amd64:
name old time/op new time/op delta
DecodeIndex-8 4.67s ± 5% 4.83s ± 9% ~ (p=0.315 n=10+10)
DecodeIndexParallel-8 4.67s ± 3% 4.68s ± 4% ~ (p=0.315 n=10+10)
IndexHasUnknown-8 37.8ns ± 8% 44.5ns ±19% ~ (p=0.095 n=5+5)
IndexHasKnown-8 38.5ns ±12% 36.9ns ± 8% ~ (p=0.690 n=5+5)
IndexAlloc-8 615ms ±18% 628ms ±18% ~ (p=0.218 n=10+10)
IndexAllocParallel-8 245ms ±11% 262ms ± 9% +7.02% (p=0.043 n=10+10)
MasterIndexAlloc-8 286ms ± 9% 287ms ±13% ~ (p=1.000 n=10+10)
LoadIndex/v1-8 27.0ms ± 4% 26.8ms ± 0% ~ (p=1.000 n=5+5)
LoadIndex/v2-8 22.4ms ± 1% 22.5ms ± 0% ~ (p=0.056 n=5+5)
name old alloc/op new alloc/op delta
IndexAlloc-8 446MB ± 0% 446MB ± 0% ~ (p=1.000 n=8+10)
IndexAllocParallel-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.000 n=8+8)
MasterIndexAlloc-8 213MB ± 0% 160MB ± 0% -25.02% (p=0.000 n=10+9)
name old allocs/op new allocs/op delta
IndexAlloc-8 913k ± 0% 1333k ± 0% +45.94% (p=0.000 n=8+10)
IndexAllocParallel-8 913k ± 0% 1333k ± 0% +45.94% (p=0.000 n=8+8)
MasterIndexAlloc-8 318k ± 0% 525k ± 0% +64.99% (p=0.000 n=10+10)
The allocation method indexmap.newEntry has also been rewritten in a
form that is a few instructions shorter.
2022-05-11 18:53:21 +00:00
|
|
|
e = &free[0]
|
|
|
|
for i := 1; i < len(free)-1; i++ {
|
2020-06-23 20:13:25 +00:00
|
|
|
free[i].next = &free[i+1]
|
|
|
|
}
|
repository: Re-tune indexmap allocation strategy
fd05037e1ad7350d89ae91fdb9f52f954787bc4a changed the allocation batch
size from 256 to 128 under the assumption that an indexEntry is 60 bytes
on amd64, but it's 64: structs are padded out to a multiple of 8 for
alignment reasons. That means we'd waste no space in malloc even without
the batch allocation, at least on 64-bit machines. While that strategy
cuts the overallocation down dramatically for many small indexes, it also
seems to slow allocation down (Go 1.18, Linux, amd64, -benchtime=2s):
name old time/op new time/op delta
DecodeIndex-8 4.67s ± 5% 4.60s ± 1% ~ (p=0.953 n=10+5)
DecodeIndexParallel-8 4.67s ± 3% 4.60s ± 1% ~ (p=0.953 n=10+5)
IndexHasUnknown-8 37.8ns ± 8% 36.5ns ±14% ~ (p=0.841 n=5+5)
IndexHasKnown-8 38.5ns ±12% 37.7ns ±10% ~ (p=0.968 n=5+5)
IndexAlloc-8 615ms ±18% 607ms ± 1% ~ (p=1.000 n=10+5)
IndexAllocParallel-8 245ms ±11% 285ms ± 6% +16.40% (p=0.001 n=10+5)
MasterIndexAlloc-8 286ms ± 9% 275ms ± 2% ~ (p=1.000 n=10+5)
LoadIndex/v1-8 27.0ms ± 4% 26.8ms ± 1% ~ (p=0.690 n=5+5)
LoadIndex/v2-8 22.4ms ± 1% 22.8ms ± 2% +1.48% (p=0.016 n=5+5)
name old alloc/op new alloc/op delta
IndexAlloc-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.000 n=8+4)
IndexAllocParallel-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.008 n=8+5)
MasterIndexAlloc-8 213MB ± 0% 159MB ± 0% -25.47% (p=0.000 n=10+5)
name old allocs/op new allocs/op delta
IndexAlloc-8 913k ± 0% 2632k ± 0% +188.19% (p=0.008 n=5+5)
IndexAllocParallel-8 913k ± 0% 2632k ± 0% +188.21% (p=0.008 n=5+5)
MasterIndexAlloc-8 318k ± 0% 1172k ± 0% +267.86% (p=0.008 n=5+5)
Instead, this patch sets a batch size of 4, which means no space is
wasted by malloc on 64-bit and very little on 32-bit. It still gets very
close to the savings from not allocating in batches, without requiring
special code for bits.UintSize==64. Benchmark results, again for
Linux/amd64:
name old time/op new time/op delta
DecodeIndex-8 4.67s ± 5% 4.83s ± 9% ~ (p=0.315 n=10+10)
DecodeIndexParallel-8 4.67s ± 3% 4.68s ± 4% ~ (p=0.315 n=10+10)
IndexHasUnknown-8 37.8ns ± 8% 44.5ns ±19% ~ (p=0.095 n=5+5)
IndexHasKnown-8 38.5ns ±12% 36.9ns ± 8% ~ (p=0.690 n=5+5)
IndexAlloc-8 615ms ±18% 628ms ±18% ~ (p=0.218 n=10+10)
IndexAllocParallel-8 245ms ±11% 262ms ± 9% +7.02% (p=0.043 n=10+10)
MasterIndexAlloc-8 286ms ± 9% 287ms ±13% ~ (p=1.000 n=10+10)
LoadIndex/v1-8 27.0ms ± 4% 26.8ms ± 0% ~ (p=1.000 n=5+5)
LoadIndex/v2-8 22.4ms ± 1% 22.5ms ± 0% ~ (p=0.056 n=5+5)
name old alloc/op new alloc/op delta
IndexAlloc-8 446MB ± 0% 446MB ± 0% ~ (p=1.000 n=8+10)
IndexAllocParallel-8 446MB ± 0% 446MB ± 0% -0.00% (p=0.000 n=8+8)
MasterIndexAlloc-8 213MB ± 0% 160MB ± 0% -25.02% (p=0.000 n=10+9)
name old allocs/op new allocs/op delta
IndexAlloc-8 913k ± 0% 1333k ± 0% +45.94% (p=0.000 n=8+10)
IndexAllocParallel-8 913k ± 0% 1333k ± 0% +45.94% (p=0.000 n=8+8)
MasterIndexAlloc-8 318k ± 0% 525k ± 0% +64.99% (p=0.000 n=10+10)
The allocation method indexmap.newEntry has also been rewritten in a
form that is a few instructions shorter.
2022-05-11 18:53:21 +00:00
|
|
|
m.free = &free[1]
|
2020-06-23 20:13:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return e
|
|
|
|
}
|
|
|
|
|
|
|
|
type indexEntry struct {
|
2022-02-13 16:24:09 +00:00
|
|
|
id restic.ID
|
|
|
|
next *indexEntry
|
|
|
|
packIndex int // Position in containing Index's packs field.
|
|
|
|
offset uint32
|
|
|
|
length uint32
|
|
|
|
uncompressedLength uint32
|
2020-06-23 20:13:25 +00:00
|
|
|
}
|