diff --git a/README.md b/README.md index 58b0b03..d9cd741 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,15 @@ are concatable: hash sum of data can be calculated based on hashes of chunks. The example of how it works can be seen in tests. +# Benchmarks + +## AVX vs AVX2 version + +``` +BenchmarkAVX-8 300 3566248 ns/op 64 B/op 4 allocs/op +BenchmarkAVX2-8 500 2857174 ns/op 64 B/op 2 allocs/op +``` + # Contributing At this moment, we do not accept contributions. Follow us. diff --git a/tz/hash.go b/tz/hash.go index dc46290..b8b9e50 100644 --- a/tz/hash.go +++ b/tz/hash.go @@ -21,7 +21,7 @@ type digest struct { } // type assertion -var _ hash.Hash = new(digest) +var _ hash.Hash = (*digest)(nil) var ( minmax = [2]gf127.GF127{{0, 0}, {math.MaxUint64, math.MaxUint64}} @@ -47,11 +47,10 @@ func (d *digest) checkSum() [hashSize]byte { } func (d *digest) byteArray() (b [hashSize]byte) { - var t []byte - for i := 0; i < 4; i++ { - t = d.x[i].ByteArray() - copy(b[i*16:], t) - } + copy(b[:], d.x[0].ByteArray()) + copy(b[16:], d.x[1].ByteArray()) + copy(b[32:], d.x[2].ByteArray()) + copy(b[48:], d.x[3].ByteArray()) return } @@ -85,11 +84,20 @@ func (d *digest) BlockSize() int { return hashBlockSize } -// Sum returnz Tillich-Zémor checksum of data -func Sum(data []byte) [hashSize]byte { +// Sum returnz Tillich-Zémor checksum of data. +// It uses only AVX instructions (no AVX2). +func SumAVX(data []byte) [hashSize]byte { d := new(digest) d.Reset() - d.Write(data) + _, _ = d.Write(data) // no errors + return d.checkSum() +} + +// Sum returns Tillich-Zémor checksum of data. +func Sum(data []byte) [hashSize]byte { + d := new(digest2) + d.Reset() + _, _ = d.Write(data) // no errors return d.checkSum() } diff --git a/tz/hash_avx2.go b/tz/hash_avx2.go new file mode 100644 index 0000000..5d43516 --- /dev/null +++ b/tz/hash_avx2.go @@ -0,0 +1,55 @@ +package tz + +import ( + "hash" + + "github.com/nspcc-dev/tzhash/gf127" +) + +type digest2 struct { + x [2]gf127.GF127x2 +} + +var _ hash.Hash = (*digest2)(nil) + +func (d *digest2) Write(data []byte) (n int, err error) { + n = len(data) + for _, b := range data { + mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>7)&1]) + mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>6)&1]) + mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>5)&1]) + mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>4)&1]) + mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>3)&1]) + mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>2)&1]) + mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>1)&1]) + mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>0)&1]) + } + return +} + +func (d *digest2) Sum(in []byte) []byte { + // Make a copy of d so that caller can keep writing and summing. + d0 := *d + h := d0.checkSum() + return append(in, h[:]...) +} +func (d *digest2) Reset() { + d.x[0] = gf127.GF127x2{1, 0, 0, 0} + d.x[1] = gf127.GF127x2{0, 0, 1, 0} +} +func (d *digest2) Size() int { return hashSize } +func (d *digest2) BlockSize() int { return hashBlockSize } +func (d *digest2) checkSum() (b [hashSize]byte) { + // Matrix is stored transposed, + // but we need to use order consistent with digest. + h := d.x[0].ByteArray() + copy(b[:], h[:16]) + copy(b[32:], h[16:]) + + h = d.x[1].ByteArray() + copy(b[16:], h[:16]) + copy(b[48:], h[16:]) + return +} + +func mulBitRightx2(c00c10 *gf127.GF127x2, c01c11 *gf127.GF127x2, e *gf127.GF127) diff --git a/tz/hash_test.go b/tz/hash_test.go index a9d5105..e52ed90 100644 --- a/tz/hash_test.go +++ b/tz/hash_test.go @@ -2,13 +2,95 @@ package tz import ( "encoding/hex" + "io" "math/rand" "testing" "github.com/stretchr/testify/require" ) +const benchDataSize = 100000 + +var testCases = []struct { + input []byte + hash string +}{ + { + []byte{}, + "00000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001", + }, + { + []byte{0, 1, 2, 3, 4, 5, 6, 7, 8}, + "00000000000001e4a545e5b90fb6882b00000000000000c849cd88f79307f67100000000000000cd0c898cb68356e624000000000000007cbcdc7c5e89b16e4b", + }, + { + []byte{4, 8, 15, 16, 23, 42, 255, 0, 127, 65, 32, 123, 42, 45, 201, 210, 213, 244}, + "4db8a8e253903c70ab0efb65fe6de05a36d1dc9f567a147152d0148a86817b2062908d9b026a506007c1118e86901b672a39317c55ee3c10ac8efafa79efe8ee", + }, +} + func TestHash(t *testing.T) { + t.Run("test AVX digest", func(t *testing.T) { + d := new(digest) + for _, tc := range testCases { + d.Reset() + _, _ = d.Write(tc.input) + sum := d.checkSum() + + require.Equal(t, tc.hash, hex.EncodeToString(sum[:])) + } + }) + + t.Run("test AVX2 digest", func(t *testing.T) { + d := new(digest2) + for _, tc := range testCases { + d.Reset() + _, _ = d.Write(tc.input) + sum := d.checkSum() + + require.Equal(t, tc.hash, hex.EncodeToString(sum[:])) + } + }) +} + +func newBuffer() (data []byte) { + data = make([]byte, benchDataSize) + + r := rand.New(rand.NewSource(0)) + _, err := io.ReadFull(r, data) + if err != nil { + panic("cant initialize buffer") + } + return +} + +func BenchmarkAVX(b *testing.B) { + data := newBuffer() + + b.ResetTimer() + b.ReportAllocs() + d := new(digest) + for i := 0; i < b.N; i++ { + d.Reset() + _, _ = d.Write(data) + d.checkSum() + } +} + +func BenchmarkAVX2(b *testing.B) { + data := newBuffer() + + b.ResetTimer() + b.ReportAllocs() + d := new(digest2) + for i := 0; i < b.N; i++ { + d.Reset() + _, _ = d.Write(data) + d.checkSum() + } +} + +func TestHomomorphism(t *testing.T) { var ( c1, c2 sl2 n int @@ -36,7 +118,7 @@ func TestHash(t *testing.T) { require.Equal(t, h, c1.ByteArray()) } -var testCases = []struct { +var testCasesConcat = []struct { Hash string Parts []string }{{ @@ -62,7 +144,7 @@ func TestConcat(t *testing.T) { err error ) - for _, tc := range testCases { + for _, tc := range testCasesConcat { expect, err = hex.DecodeString(tc.Hash) require.NoError(t, err) @@ -86,7 +168,7 @@ func TestValidate(t *testing.T) { err error ) - for _, tc := range testCases { + for _, tc := range testCasesConcat { hash, _ = hex.DecodeString(tc.Hash) require.NoError(t, err) diff --git a/tz/tzbits_amd64.s b/tz/tzbits_amd64.s index 7ad3a0e..063a154 100644 --- a/tz/tzbits_amd64.s +++ b/tz/tzbits_amd64.s @@ -60,3 +60,30 @@ TEXT ·mulBitRight(SB),NOSPLIT,$0 XORPD X9, X3 MOVUPD X3, (DX) RET + + +// func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64) +TEXT ·mulBitRightx2(SB),NOSPLIT,$0 + MOVQ c00c10+0(FP), AX + VMOVDQA (AX), Y0 + MOVQ c01c11+8(FP), BX + VMOVDQA (BX), Y8 + + VPSLLQ $1, Y0, Y1 + VPALIGNR $8, Y1, Y0, Y2 + VPSRLQ $63, Y2, Y2 + VPXOR Y1, Y2, Y2 + VPSRLQ $63, Y1, Y3 + VPSLLQ $63, Y3, Y3 + VPUNPCKHQDQ Y3, Y3, Y3 + VPXOR Y2, Y3, Y3 + + MOVQ e+16(FP), CX + VBROADCASTI128 (CX), Y2 + + VPXOR Y3, Y8, Y3 + VPAND Y3, Y2, Y4 + VPXOR Y4, Y0, Y8 + VMOVDQA Y8, (BX) + VMOVDQA Y3, (AX) + RET