c68e38b943
Right now the AVX2 implementation loses to the C binding in speed. This is probably because of two things: 1. Go does not inline `mulBitRightx2` in the loop iteration. 2. `minmax` is loaded from memory every time. In this PR: 1. Unroll `mulBitRightx2` manually and use `mulByteRightx2` instead. 2. Generate `minmax` in place without `LOAD/LEA` instructions.
61 lines
1.5 KiB
Go
61 lines
1.5 KiB
Go
package tz
|
|
|
|
import (
|
|
"hash"
|
|
|
|
"github.com/nspcc-dev/tzhash/gf127"
|
|
)
|
|
|
|
type digest2 struct {
|
|
x [2]gf127.GF127x2
|
|
}
|
|
|
|
// Compile-time assertion that *digest2 satisfies hash.Hash.
var _ hash.Hash = (*digest2)(nil)
|
|
|
|
func NewAVX2() hash.Hash {
|
|
d := new(digest2)
|
|
d.Reset()
|
|
return d
|
|
}
|
|
|
|
func (d *digest2) Write(data []byte) (n int, err error) {
|
|
n = len(data)
|
|
for _, b := range data {
|
|
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>7)&1])
|
|
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>6)&1])
|
|
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>5)&1])
|
|
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>4)&1])
|
|
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>3)&1])
|
|
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>2)&1])
|
|
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>1)&1])
|
|
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>0)&1])
|
|
}
|
|
return
|
|
}
|
|
|
|
func (d *digest2) Sum(in []byte) []byte {
|
|
// Make a copy of d so that caller can keep writing and summing.
|
|
d0 := *d
|
|
h := d0.checkSum()
|
|
return append(in, h[:]...)
|
|
}
|
|
func (d *digest2) Reset() {
|
|
d.x[0] = gf127.GF127x2{1, 0, 0, 0}
|
|
d.x[1] = gf127.GF127x2{0, 0, 1, 0}
|
|
}
|
|
// Size returns hashSize, the length in bytes of the value produced by checkSum.
func (d *digest2) Size() int {
	return hashSize
}
|
|
// BlockSize returns hashBlockSize, the block size reported for this hash.
func (d *digest2) BlockSize() int {
	return hashBlockSize
}
|
|
func (d *digest2) checkSum() (b [hashSize]byte) {
|
|
// Matrix is stored transposed,
|
|
// but we need to use order consistent with digest.
|
|
h := d.x[0].ByteArray()
|
|
copy(b[:], h[:16])
|
|
copy(b[32:], h[16:])
|
|
|
|
h = d.x[1].ByteArray()
|
|
copy(b[16:], h[:16])
|
|
copy(b[48:], h[16:])
|
|
return
|
|
}
|
|
|
|
// mulBitRightx2 is declared without a body, so its implementation is provided
// outside this Go file (presumably the package's AVX2 assembly — confirm).
// Write calls it once per input bit, passing both halves of the digest state
// (c00c10 and c01c11) and the minmax entry e selected by that bit.
func mulBitRightx2(c00c10 *gf127.GF127x2, c01c11 *gf127.GF127x2, e *gf127.GF127)
|