forked from TrueCloudLab/restic
chunker: Refactor to take polynomial on creation
This commit is contained in:
parent
3cdf3a25b9
commit
367cc75970
2 changed files with 75 additions and 51 deletions
|
@ -1,9 +1,9 @@
|
||||||
package chunker
|
package chunker
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
"hash"
|
"hash"
|
||||||
"io"
|
"io"
|
||||||
"sync"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
@ -23,20 +23,6 @@ const (
|
||||||
splitmask = (1 << AverageBits) - 1
|
splitmask = (1 << AverageBits) - 1
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
|
||||||
// pol is a randomly generated irreducible polynomial of degree 53
|
|
||||||
// in Z_2[X]. All rabin fingerprints are calculated with this polynomial.
|
|
||||||
pol = uint64(0x3DA3358B4DC173)
|
|
||||||
|
|
||||||
pol_shift = deg(pol) - 8
|
|
||||||
once sync.Once
|
|
||||||
mod_table [256]uint64
|
|
||||||
out_table [256]uint64
|
|
||||||
|
|
||||||
// tables have been filled, do not allow changing the polynom afterwards
|
|
||||||
filled bool
|
|
||||||
)
|
|
||||||
|
|
||||||
// A chunk is one content-dependent chunk of bytes whose end was cut when the
|
// A chunk is one content-dependent chunk of bytes whose end was cut when the
|
||||||
// Rabin Fingerprint had the value stored in Cut.
|
// Rabin Fingerprint had the value stored in Cut.
|
||||||
type Chunk struct {
|
type Chunk struct {
|
||||||
|
@ -52,6 +38,11 @@ func (c Chunk) Reader(r io.ReaderAt) io.Reader {
|
||||||
|
|
||||||
// A chunker internally holds everything needed to split content.
|
// A chunker internally holds everything needed to split content.
|
||||||
type Chunker struct {
|
type Chunker struct {
|
||||||
|
pol Pol
|
||||||
|
pol_shift uint
|
||||||
|
mod_table [256]uint64
|
||||||
|
out_table [256]uint64
|
||||||
|
|
||||||
rd io.Reader
|
rd io.Reader
|
||||||
closed bool
|
closed bool
|
||||||
|
|
||||||
|
@ -72,28 +63,24 @@ type Chunker struct {
|
||||||
h hash.Hash
|
h hash.Hash
|
||||||
}
|
}
|
||||||
|
|
||||||
// Polynomial sets the polynomial that is to be used for calculating the rabin
|
// New returns a new Chunker based on polynomial p that reads data from rd
|
||||||
// fingerprints. This function must be called before the first chunker is
|
// with bufsize and passes all data to hash along the way.
|
||||||
// created, otherwise the results are undefined.
|
func New(rd io.Reader, p Pol, bufsize int, hash hash.Hash) (*Chunker, error) {
|
||||||
func SetPolynomial(f uint64) {
|
// test irreducibility of p again
|
||||||
if filled {
|
if !p.Irreducible() {
|
||||||
panic("polynomial changed after chunker has already been used")
|
return nil, errors.New("invalid polynomial")
|
||||||
}
|
}
|
||||||
pol = f
|
|
||||||
}
|
|
||||||
|
|
||||||
// New returns a new Chunker that reads from data from rd with bufsize and pass
|
|
||||||
// all data to hash along the way.
|
|
||||||
func New(rd io.Reader, bufsize int, hash hash.Hash) *Chunker {
|
|
||||||
once.Do(fill_tables)
|
|
||||||
|
|
||||||
c := &Chunker{
|
c := &Chunker{
|
||||||
|
pol: p,
|
||||||
|
pol_shift: uint(p.Deg() - 8),
|
||||||
buf: make([]byte, bufsize),
|
buf: make([]byte, bufsize),
|
||||||
h: hash,
|
h: hash,
|
||||||
}
|
}
|
||||||
|
c.fill_tables()
|
||||||
c.Reset(rd)
|
c.Reset(rd)
|
||||||
|
|
||||||
return c
|
return c, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reset restarts a chunker so that it can be reused with a different reader as
|
// Reset restarts a chunker so that it can be reused with a different reader as
|
||||||
|
@ -121,9 +108,7 @@ func (c *Chunker) Reset(rd io.Reader) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate out_table and mod_table for optimization. Must be called only once.
|
// Calculate out_table and mod_table for optimization. Must be called only once.
|
||||||
func fill_tables() {
|
func (c *Chunker) fill_tables() {
|
||||||
filled = true
|
|
||||||
|
|
||||||
// calculate table for sliding out bytes. The byte to slide out is used as
|
// calculate table for sliding out bytes. The byte to slide out is used as
|
||||||
// the index for the table, the value contains the following:
|
// the index for the table, the value contains the following:
|
||||||
// out_table[b] = Hash(b || 0 || ... || 0)
|
// out_table[b] = Hash(b || 0 || ... || 0)
|
||||||
|
@ -138,15 +123,15 @@ func fill_tables() {
|
||||||
for b := 0; b < 256; b++ {
|
for b := 0; b < 256; b++ {
|
||||||
var hash uint64
|
var hash uint64
|
||||||
|
|
||||||
hash = append_byte(hash, byte(b), pol)
|
hash = append_byte(hash, byte(b), uint64(c.pol))
|
||||||
for i := 0; i < WindowSize-1; i++ {
|
for i := 0; i < WindowSize-1; i++ {
|
||||||
hash = append_byte(hash, 0, pol)
|
hash = append_byte(hash, 0, uint64(c.pol))
|
||||||
}
|
}
|
||||||
out_table[b] = hash
|
c.out_table[b] = hash
|
||||||
}
|
}
|
||||||
|
|
||||||
// calculate table for reduction mod Polynomial
|
// calculate table for reduction mod Polynomial
|
||||||
k := deg(pol)
|
k := c.pol.Deg()
|
||||||
for b := 0; b < 256; b++ {
|
for b := 0; b < 256; b++ {
|
||||||
// mod_table[b] = A | B, where A = (b(x) * x^k mod pol) and B = b(x) * x^k
|
// mod_table[b] = A | B, where A = (b(x) * x^k mod pol) and B = b(x) * x^k
|
||||||
//
|
//
|
||||||
|
@ -155,7 +140,7 @@ func fill_tables() {
|
||||||
// two parts: Part A contains the result of the modulus operation, part
|
// two parts: Part A contains the result of the modulus operation, part
|
||||||
// B is used to cancel out the 8 top bits so that one XOR operation is
|
// B is used to cancel out the 8 top bits so that one XOR operation is
|
||||||
// enough to reduce modulo Polynomial
|
// enough to reduce modulo Polynomial
|
||||||
mod_table[b] = mod(uint64(b)<<uint(k), pol) | (uint64(b) << uint(k))
|
c.mod_table[b] = mod(uint64(b)<<uint(k), uint64(c.pol)) | (uint64(b) << uint(k))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -226,15 +211,15 @@ func (c *Chunker) Next() (*Chunk, error) {
|
||||||
// inline c.slide(b) and append(b) to increase performance
|
// inline c.slide(b) and append(b) to increase performance
|
||||||
out := c.window[c.wpos]
|
out := c.window[c.wpos]
|
||||||
c.window[c.wpos] = b
|
c.window[c.wpos] = b
|
||||||
c.digest ^= out_table[out]
|
c.digest ^= c.out_table[out]
|
||||||
c.wpos = (c.wpos + 1) % WindowSize
|
c.wpos = (c.wpos + 1) % WindowSize
|
||||||
|
|
||||||
// c.append(b)
|
// c.append(b)
|
||||||
index := c.digest >> uint(pol_shift)
|
index := c.digest >> c.pol_shift
|
||||||
c.digest <<= 8
|
c.digest <<= 8
|
||||||
c.digest |= uint64(b)
|
c.digest |= uint64(b)
|
||||||
|
|
||||||
c.digest ^= mod_table[index]
|
c.digest ^= c.mod_table[index]
|
||||||
// end inline
|
// end inline
|
||||||
|
|
||||||
add++
|
add++
|
||||||
|
@ -300,17 +285,17 @@ func (c *Chunker) hashDigest() []byte {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Chunker) append(b byte) {
|
func (c *Chunker) append(b byte) {
|
||||||
index := c.digest >> uint(pol_shift)
|
index := c.digest >> c.pol_shift
|
||||||
c.digest <<= 8
|
c.digest <<= 8
|
||||||
c.digest |= uint64(b)
|
c.digest |= uint64(b)
|
||||||
|
|
||||||
c.digest ^= mod_table[index]
|
c.digest ^= c.mod_table[index]
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Chunker) slide(b byte) {
|
func (c *Chunker) slide(b byte) {
|
||||||
out := c.window[c.wpos]
|
out := c.window[c.wpos]
|
||||||
c.window[c.wpos] = b
|
c.window[c.wpos] = b
|
||||||
c.digest ^= out_table[out]
|
c.digest ^= c.out_table[out]
|
||||||
c.wpos = (c.wpos + 1) % WindowSize
|
c.wpos = (c.wpos + 1) % WindowSize
|
||||||
|
|
||||||
c.append(b)
|
c.append(b)
|
||||||
|
|
|
@ -12,6 +12,7 @@ import (
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"os"
|
"os"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/restic/restic/chunker"
|
"github.com/restic/restic/chunker"
|
||||||
)
|
)
|
||||||
|
@ -34,6 +35,9 @@ type chunk struct {
|
||||||
Digest []byte
|
Digest []byte
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// polynomial used for all the tests below
|
||||||
|
const testPol = chunker.Pol(0x3DA3358B4DC173)
|
||||||
|
|
||||||
// created for 32MB of random data out of math/rand's Uint32() seeded by
|
// created for 32MB of random data out of math/rand's Uint32() seeded by
|
||||||
// constant 23
|
// constant 23
|
||||||
//
|
//
|
||||||
|
@ -146,7 +150,8 @@ func get_random(seed, count int) []byte {
|
||||||
func TestChunker(t *testing.T) {
|
func TestChunker(t *testing.T) {
|
||||||
// setup data source
|
// setup data source
|
||||||
buf := get_random(23, 32*1024*1024)
|
buf := get_random(23, 32*1024*1024)
|
||||||
ch := chunker.New(bytes.NewReader(buf), *testBufSize, sha256.New())
|
ch, err := chunker.New(bytes.NewReader(buf), testPol, *testBufSize, sha256.New())
|
||||||
|
ok(t, err)
|
||||||
chunks := test_with_data(t, ch, chunks1)
|
chunks := test_with_data(t, ch, chunks1)
|
||||||
|
|
||||||
// test reader
|
// test reader
|
||||||
|
@ -173,15 +178,43 @@ func TestChunker(t *testing.T) {
|
||||||
|
|
||||||
// setup nullbyte data source
|
// setup nullbyte data source
|
||||||
buf = bytes.Repeat([]byte{0}, len(chunks2)*chunker.MinSize)
|
buf = bytes.Repeat([]byte{0}, len(chunks2)*chunker.MinSize)
|
||||||
ch = chunker.New(bytes.NewReader(buf), *testBufSize, sha256.New())
|
ch, err = chunker.New(bytes.NewReader(buf), testPol, *testBufSize, sha256.New())
|
||||||
|
ok(t, err)
|
||||||
|
|
||||||
test_with_data(t, ch, chunks2)
|
test_with_data(t, ch, chunks2)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChunkerWithRandomPolynomial(t *testing.T) {
|
||||||
|
// setup data source
|
||||||
|
buf := get_random(23, 32*1024*1024)
|
||||||
|
|
||||||
|
// generate a new random polynomial
|
||||||
|
start := time.Now()
|
||||||
|
p, err := chunker.RandomPolynomial()
|
||||||
|
ok(t, err)
|
||||||
|
t.Logf("generating random polynomial took %v", time.Since(start))
|
||||||
|
|
||||||
|
start = time.Now()
|
||||||
|
ch, err := chunker.New(bytes.NewReader(buf), p, *testBufSize, sha256.New())
|
||||||
|
ok(t, err)
|
||||||
|
t.Logf("creating chunker took %v", time.Since(start))
|
||||||
|
|
||||||
|
// make sure that first chunk is different
|
||||||
|
c, err := ch.Next()
|
||||||
|
|
||||||
|
assert(t, c.Cut != chunks1[0].CutFP,
|
||||||
|
"Cut point is the same")
|
||||||
|
assert(t, c.Length != chunks1[0].Length,
|
||||||
|
"Length is the same")
|
||||||
|
assert(t, !bytes.Equal(c.Digest, chunks1[0].Digest),
|
||||||
|
"Digest is the same")
|
||||||
|
}
|
||||||
|
|
||||||
func TestChunkerWithoutHash(t *testing.T) {
|
func TestChunkerWithoutHash(t *testing.T) {
|
||||||
// setup data source
|
// setup data source
|
||||||
buf := get_random(23, 32*1024*1024)
|
buf := get_random(23, 32*1024*1024)
|
||||||
ch := chunker.New(bytes.NewReader(buf), *testBufSize, nil)
|
ch, err := chunker.New(bytes.NewReader(buf), testPol, *testBufSize, nil)
|
||||||
|
ok(t, err)
|
||||||
chunks := test_with_data(t, ch, chunks1)
|
chunks := test_with_data(t, ch, chunks1)
|
||||||
|
|
||||||
// test reader
|
// test reader
|
||||||
|
@ -211,14 +244,16 @@ func TestChunkerWithoutHash(t *testing.T) {
|
||||||
|
|
||||||
// setup nullbyte data source
|
// setup nullbyte data source
|
||||||
buf = bytes.Repeat([]byte{0}, len(chunks2)*chunker.MinSize)
|
buf = bytes.Repeat([]byte{0}, len(chunks2)*chunker.MinSize)
|
||||||
ch = chunker.New(bytes.NewReader(buf), *testBufSize, sha256.New())
|
ch, err = chunker.New(bytes.NewReader(buf), testPol, *testBufSize, sha256.New())
|
||||||
|
ok(t, err)
|
||||||
|
|
||||||
test_with_data(t, ch, chunks2)
|
test_with_data(t, ch, chunks2)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChunkerReuse(t *testing.T) {
|
func TestChunkerReuse(t *testing.T) {
|
||||||
// test multiple uses of the same chunker
|
// test multiple uses of the same chunker
|
||||||
ch := chunker.New(nil, *testBufSize, sha256.New())
|
ch, err := chunker.New(nil, testPol, *testBufSize, sha256.New())
|
||||||
|
ok(t, err)
|
||||||
buf := get_random(23, 32*1024*1024)
|
buf := get_random(23, 32*1024*1024)
|
||||||
|
|
||||||
for i := 0; i < 4; i++ {
|
for i := 0; i < 4; i++ {
|
||||||
|
@ -254,9 +289,13 @@ func benchmarkChunker(b *testing.B, hash hash.Hash) {
|
||||||
rd = bytes.NewReader(get_random(23, size))
|
rd = bytes.NewReader(get_random(23, size))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
t1 := time.Now()
|
||||||
|
ch, err := chunker.New(rd, testPol, *testBufSize, hash)
|
||||||
|
ok(b, err)
|
||||||
|
b.Logf("generating tables took %v", time.Since(t1))
|
||||||
|
|
||||||
b.ResetTimer()
|
b.ResetTimer()
|
||||||
b.SetBytes(int64(size))
|
b.SetBytes(int64(size))
|
||||||
ch := chunker.New(rd, *testBufSize, hash)
|
|
||||||
|
|
||||||
var chunks int
|
var chunks int
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
|
|
Loading…
Reference in a new issue