Optimize chunker

Skip MinSize bytes at the beginning of each chunk. This increases
throughput by about 30%.

Before:
    $ go test -v -bench . ./chunker
    === RUN TestChunker
    --- PASS: TestChunker (0.60s)
    === RUN TestChunkerReuse
    --- PASS: TestChunkerReuse (2.18s)
    PASS
    BenchmarkChunker          10     111229642 ns/op      94.27 MB/s
    --- BENCH: BenchmarkChunker
        chunker_test.go:168: 6 chunks, average chunk size: 1747626 bytes
        chunker_test.go:168: 6 chunks, average chunk size: 1747626 bytes
    ok      github.com/restic/restic/chunker    4.120s

After:
    $ go test -v -bench . ./chunker
    === RUN TestChunker
    --- PASS: TestChunker (0.48s)
    === RUN TestChunkerReuse
    --- PASS: TestChunkerReuse (1.75s)
    PASS
    BenchmarkChunker          20      81468596 ns/op     128.71 MB/s
    --- BENCH: BenchmarkChunker
        chunker_test.go:168: 6 chunks, average chunk size: 1747626 bytes
        chunker_test.go:168: 6 chunks, average chunk size: 1747626 bytes
    ok      github.com/restic/restic/chunker    4.061s
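
The gain comes from not pushing bytes through the rolling hash when no cut
can occur anyway: a chunk is never shorter than MinSize, so the first
MinSize - WindowSize bytes of every chunk can be copied verbatim and only the
remainder needs hashing. Below is a minimal, self-contained sketch of that
idea, not the restic code: the constants and the boundary predicate are toy
stand-ins.

    package main

    import "fmt"

    const (
    	minSize    = 512 * 1024 // stand-in for the chunker's MinSize
    	windowSize = 64         // stand-in for the rolling-hash window size
    )

    // nextChunkLen returns the length of the next chunk in data. The first
    // minSize-windowSize bytes are skipped outright: no boundary can occur
    // before minSize bytes, so hashing them is wasted work. Only the rest is
    // fed through the (stand-in) boundary predicate.
    func nextChunkLen(data []byte, isBoundary func(window []byte) bool) int {
    	pre := minSize - windowSize
    	if pre >= len(data) {
    		return len(data) // not enough input left for a full-sized chunk
    	}
    	for i := pre; i+windowSize <= len(data); i++ {
    		if isBoundary(data[i : i+windowSize]) {
    			return i + windowSize
    		}
    	}
    	return len(data)
    }

    func main() {
    	data := make([]byte, 4*minSize)
    	// toy predicate: cut when the byte sum of the window is divisible by 2048
    	boundary := func(w []byte) bool {
    		sum := 0
    		for _, b := range w {
    			sum += int(b)
    		}
    		return sum%2048 == 0
    	}
    	fmt.Println("first chunk length:", nextChunkLen(data, boundary))
    }

In the real chunker the input arrives buffer by buffer, so the skip is tracked
incrementally in the new c.pre field (see the diff below) rather than as a
single slice offset.
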
Author: Alexander Neumann
Date:   2015-01-14 16:33:41 +01:00
Commit: bdcdcdea7d (parent cdf3336e7a)
2 changed files with 29 additions and 3 deletions

chunker/chunker.go

@@ -66,6 +66,8 @@ type Chunker struct {
	count int
	pos int
	pre int // wait for this many bytes before starting to calculate a new chunk
	digest uint64
}
@@ -96,6 +98,8 @@ func (c *Chunker) reset() {
	c.pos = 0
	c.count = 0
	c.slide(1)
	// do not start a new chunk unless at least MinSize bytes have been read
	c.pre = MinSize - WindowSize
}
// Calculate out_table and mod_table for optimization. Must be called only once.
@@ -141,6 +145,7 @@ func (c *Chunker) fill_tables() {
// calls yield a nil chunk and an io.EOF error.
func (c *Chunker) Next(dst []byte) (*Chunk, error) {
	dst = dst[:0]
	for {
		if c.bpos >= c.bmax {
			n, err := io.ReadFull(c.rd, c.buf)
@@ -176,6 +181,26 @@ func (c *Chunker) Next(dst []byte) (*Chunk, error) {
			c.bmax = n
		}
		// check if bytes have to be dismissed before starting a new chunk
		if c.pre > 0 {
			n := c.bmax - c.bpos
			if c.pre > n {
				c.pre -= n
				dst = append(dst, c.buf[c.bpos:c.bmax]...)
				c.count += n
				c.pos += n
				c.bpos = c.bmax
				continue
			}
			dst = append(dst, c.buf[c.bpos:c.bpos+c.pre]...)
			c.bpos += c.pre
			c.count += c.pre
			c.pos += c.pre
			c.pre = 0
		}
		for i, b := range c.buf[c.bpos:c.bmax] {
			// inline c.slide(b) and append(b) to increase performance
			out := c.window[c.wpos]
@@ -208,6 +233,7 @@ func (c *Chunker) Next(dst []byte) (*Chunk, error) {
				c.reset()
				c.pos = pos
				c.start = pos
				c.pre = MinSize - WindowSize
				return chunk, nil
			}
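
For orientation, here is a hedged sketch of how the Next API shown above is
typically driven. This is not code from the commit: constructing the Chunker
is omitted because no constructor appears in this diff, and the only pieces
relied on are Next, the io.EOF termination described in the doc comment, and
the Chunk.Cut field exercised by the test below.

    package example

    import (
    	"fmt"
    	"io"
    	"log"

    	"github.com/restic/restic/chunker"
    )

    // drainChunks pulls chunks until the chunker reports io.EOF. The dst
    // buffer is reused across calls; Next truncates it and appends into it.
    func drainChunks(c *chunker.Chunker) {
    	buf := make([]byte, 0, 8*1024*1024) // reusable dst buffer
    	for {
    		chunk, err := c.Next(buf)
    		if err == io.EOF {
    			break // input exhausted; further calls yield a nil chunk and io.EOF
    		}
    		if err != nil {
    			log.Fatal(err)
    		}
    		fmt.Printf("chunk cut fingerprint: %016x\n", chunk.Cut)
    	}
    }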

chunker/chunker_test.go

@@ -43,7 +43,7 @@ var chunks1 = []chunk{
	chunk{800374, 0x000968473f900000},
	chunk{2453512, 0x001e197c92600000},
	chunk{2651975, 0x000ae6c868000000},
	chunk{237392, 0x00184c5825e18636},
	chunk{237392, 0x0000000000000001},
}
// test if nullbytes are correctly split, even if length is a multiple of MinSize.
@@ -79,8 +79,8 @@ func test_with_data(t *testing.T, chnker *chunker.Chunker, chunks []chunk) {
		}
		if c.Cut != chunk.CutFP {
			t.Fatalf("Cut fingerprint for chunk %d does not match: expected %016x, got %016x",
				i, chunk.CutFP, c.Cut)
			t.Fatalf("Cut fingerprint for chunk %d/%d does not match: expected %016x, got %016x",
				i, len(chunks)-1, chunk.CutFP, c.Cut)
		}
	}
}