Update chunker

This commit is contained in:
Alexander Neumann 2016-03-05 13:46:20 +01:00
parent 4dac6d45fd
commit 9e24238cdd
2 changed files with 88 additions and 57 deletions

2
vendor/manifest vendored
View file

@ -40,7 +40,7 @@
{ {
"importpath": "github.com/restic/chunker", "importpath": "github.com/restic/chunker",
"repository": "https://github.com/restic/chunker", "repository": "https://github.com/restic/chunker",
"revision": "16c849a106e0a50d658e8f5e49a01f6728f4f92c", "revision": "d1a97fa6e55ab338a8ceb769d72f856a56e9379a",
"branch": "master" "branch": "master"
}, },
{ {

View file

@ -33,12 +33,12 @@ type tables struct {
// cache precomputed tables, these are read-only anyway // cache precomputed tables, these are read-only anyway
var cache struct { var cache struct {
entries map[Pol]*tables entries map[Pol]tables
sync.Mutex sync.Mutex
} }
func init() { func init() {
cache.entries = make(map[Pol]*tables) cache.entries = make(map[Pol]tables)
} }
// Chunk is one content-dependent chunk of bytes whose end was cut when the // Chunk is one content-dependent chunk of bytes whose end was cut when the
@ -50,17 +50,7 @@ type Chunk struct {
Data []byte Data []byte
} }
// Chunker splits content with Rabin Fingerprints. type chunkerState struct {
type Chunker struct {
MinSize, MaxSize uint
pol Pol
polShift uint
tables *tables
rd io.Reader
closed bool
window [windowSize]byte window [windowSize]byte
wpos int wpos int
@ -77,15 +67,37 @@ type Chunker struct {
digest uint64 digest uint64
} }
type chunkerConfig struct {
MinSize, MaxSize uint
pol Pol
polShift uint
tables tables
tablesInitialized bool
rd io.Reader
closed bool
}
// Chunker splits content with Rabin Fingerprints.
type Chunker struct {
chunkerConfig
chunkerState
}
// New returns a new Chunker based on polynomial p that reads from rd // New returns a new Chunker based on polynomial p that reads from rd
// with bufsize and pass all data to hash along the way. // with bufsize and pass all data to hash along the way.
func New(rd io.Reader, pol Pol) *Chunker { func New(rd io.Reader, pol Pol) *Chunker {
c := &Chunker{ c := &Chunker{
buf: make([]byte, chunkerBufSize), chunkerState: chunkerState{
pol: pol, buf: make([]byte, chunkerBufSize),
rd: rd, },
MinSize: MinSize, chunkerConfig: chunkerConfig{
MaxSize: MaxSize, pol: pol,
rd: rd,
MinSize: MinSize,
MaxSize: MaxSize,
},
} }
c.reset() c.reset()
@ -96,11 +108,15 @@ func New(rd io.Reader, pol Pol) *Chunker {
// Reset reinitializes the chunker with a new reader and polynomial. // Reset reinitializes the chunker with a new reader and polynomial.
func (c *Chunker) Reset(rd io.Reader, pol Pol) { func (c *Chunker) Reset(rd io.Reader, pol Pol) {
*c = Chunker{ *c = Chunker{
buf: c.buf, chunkerState: chunkerState{
pol: pol, buf: c.buf,
rd: rd, },
MinSize: c.MinSize, chunkerConfig: chunkerConfig{
MaxSize: c.MaxSize, pol: pol,
rd: rd,
MinSize: MinSize,
MaxSize: MaxSize,
},
} }
c.reset() c.reset()
@ -118,7 +134,7 @@ func (c *Chunker) reset() {
c.digest = 0 c.digest = 0
c.wpos = 0 c.wpos = 0
c.count = 0 c.count = 0
c.slide(1) c.digest = c.slide(c.digest, 1)
c.start = c.pos c.start = c.pos
// do not start a new chunk unless at least MinSize bytes have been read // do not start a new chunk unless at least MinSize bytes have been read
@ -133,6 +149,8 @@ func (c *Chunker) fillTables() {
return return
} }
c.tablesInitialized = true
// test if the tables are cached for this polynomial // test if the tables are cached for this polynomial
cache.Lock() cache.Lock()
defer cache.Unlock() defer cache.Unlock()
@ -141,10 +159,6 @@ func (c *Chunker) fillTables() {
return return
} }
// else create a new entry
c.tables = &tables{}
cache.entries[c.pol] = c.tables
// calculate table for sliding out bytes. The byte to slide out is used as // calculate table for sliding out bytes. The byte to slide out is used as
// the index for the table, the value contains the following: // the index for the table, the value contains the following:
// out_table[b] = Hash(b || 0 || ... || 0) // out_table[b] = Hash(b || 0 || ... || 0)
@ -178,6 +192,8 @@ func (c *Chunker) fillTables() {
// enough to reduce modulo Polynomial // enough to reduce modulo Polynomial
c.tables.mod[b] = Pol(uint64(b)<<uint(k)).Mod(c.pol) | (Pol(b) << uint(k)) c.tables.mod[b] = Pol(uint64(b)<<uint(k)).Mod(c.pol) | (Pol(b) << uint(k))
} }
cache.entries[c.pol] = c.tables
} }
// Next returns the position and length of the next chunk of data. If an error // Next returns the position and length of the next chunk of data. If an error
@ -186,13 +202,19 @@ func (c *Chunker) fillTables() {
// subsequent calls yield an io.EOF error. // subsequent calls yield an io.EOF error.
func (c *Chunker) Next(data []byte) (Chunk, error) { func (c *Chunker) Next(data []byte) (Chunk, error) {
data = data[:0] data = data[:0]
if c.tables == nil { if !c.tablesInitialized {
return Chunk{}, errors.New("polynomial is not set") return Chunk{}, errors.New("tables for polynomial computation not initialized")
} }
tabout := c.tables.out
tabmod := c.tables.mod
polShift := c.polShift
minSize := c.MinSize
maxSize := c.MaxSize
buf := c.buf
for { for {
if c.bpos >= c.bmax { if c.bpos >= c.bmax {
n, err := io.ReadFull(c.rd, c.buf[:]) n, err := io.ReadFull(c.rd, buf[:])
if err == io.ErrUnexpectedEOF { if err == io.ErrUnexpectedEOF {
err = nil err = nil
@ -230,7 +252,7 @@ func (c *Chunker) Next(data []byte) (Chunk, error) {
n := c.bmax - c.bpos n := c.bmax - c.bpos
if c.pre > uint(n) { if c.pre > uint(n) {
c.pre -= uint(n) c.pre -= uint(n)
data = append(data, c.buf[c.bpos:c.bmax]...) data = append(data, buf[c.bpos:c.bmax]...)
c.count += uint(n) c.count += uint(n)
c.pos += uint(n) c.pos += uint(n)
@ -239,7 +261,7 @@ func (c *Chunker) Next(data []byte) (Chunk, error) {
continue continue
} }
data = append(data, c.buf[c.bpos:c.bpos+c.pre]...) data = append(data, buf[c.bpos:c.bpos+c.pre]...)
c.bpos += c.pre c.bpos += c.pre
c.count += c.pre c.count += c.pre
@ -248,37 +270,41 @@ func (c *Chunker) Next(data []byte) (Chunk, error) {
} }
add := c.count add := c.count
for _, b := range c.buf[c.bpos:c.bmax] { digest := c.digest
// inline c.slide(b) and append(b) to increase performance win := c.window
out := c.window[c.wpos] wpos := c.wpos
c.window[c.wpos] = b for _, b := range buf[c.bpos:c.bmax] {
c.digest ^= uint64(c.tables.out[out]) // slide(b)
c.wpos = (c.wpos + 1) % windowSize out := win[wpos]
win[wpos] = b
digest ^= uint64(tabout[out])
wpos = (wpos + 1) % windowSize
// c.append(b) // updateDigest
index := c.digest >> c.polShift index := byte(digest >> polShift)
c.digest <<= 8 digest <<= 8
c.digest |= uint64(b) digest |= uint64(b)
c.digest ^= uint64(c.tables.mod[index]) digest ^= uint64(tabmod[index])
// end inline // end manual inline
add++ add++
if add < c.MinSize { if add < minSize {
continue continue
} }
if (c.digest&splitmask) == 0 || add >= MaxSize { if (digest&splitmask) == 0 || add >= maxSize {
i := add - c.count - 1 i := add - c.count - 1
data = append(data, c.buf[c.bpos:c.bpos+uint(i)+1]...) data = append(data, c.buf[c.bpos:c.bpos+uint(i)+1]...)
c.count = add c.count = add
c.pos += uint(i) + 1 c.pos += uint(i) + 1
c.bpos += uint(i) + 1 c.bpos += uint(i) + 1
c.buf = buf
chunk := Chunk{ chunk := Chunk{
Start: c.start, Start: c.start,
Length: c.count, Length: c.count,
Cut: c.digest, Cut: digest,
Data: data, Data: data,
} }
@ -287,6 +313,9 @@ func (c *Chunker) Next(data []byte) (Chunk, error) {
return chunk, nil return chunk, nil
} }
} }
c.digest = digest
c.window = win
c.wpos = wpos
steps := c.bmax - c.bpos steps := c.bmax - c.bpos
if steps > 0 { if steps > 0 {
@ -298,21 +327,23 @@ func (c *Chunker) Next(data []byte) (Chunk, error) {
} }
} }
func (c *Chunker) append(b byte) { func updateDigest(digest uint64, polShift uint, tab tables, b byte) (newDigest uint64) {
index := c.digest >> c.polShift index := digest >> polShift
c.digest <<= 8 digest <<= 8
c.digest |= uint64(b) digest |= uint64(b)
c.digest ^= uint64(c.tables.mod[index]) digest ^= uint64(tab.mod[index])
return digest
} }
func (c *Chunker) slide(b byte) { func (c *Chunker) slide(digest uint64, b byte) (newDigest uint64) {
out := c.window[c.wpos] out := c.window[c.wpos]
c.window[c.wpos] = b c.window[c.wpos] = b
c.digest ^= uint64(c.tables.out[out]) digest ^= uint64(c.tables.out[out])
c.wpos = (c.wpos + 1) % windowSize c.wpos = (c.wpos + 1) % windowSize
c.append(b) digest = updateDigest(digest, c.polShift, c.tables, b)
return digest
} }
func appendByte(hash Pol, b byte, pol Pol) Pol { func appendByte(hash Pol, b byte, pol Pol) Pol {