forked from TrueCloudLab/tzhash
Merge pull request #5 from nspcc-dev/feat/sl2avx2
Make use of AVX2 in Sum() by default. Add benchmarks regarding AVX vs. AVX2 performance.
This commit is contained in:
commit
500c652dcc
5 changed files with 193 additions and 12 deletions
|
@ -26,6 +26,15 @@ are concatable: hash sum of data can be calculated based on hashes of chunks.
|
|||
|
||||
The example of how it works can be seen in tests.
|
||||
|
||||
# Benchmarks
|
||||
|
||||
## AVX vs AVX2 version
|
||||
|
||||
```
|
||||
BenchmarkAVX-8 300 3566248 ns/op 64 B/op 4 allocs/op
|
||||
BenchmarkAVX2-8 500 2857174 ns/op 64 B/op 2 allocs/op
|
||||
```
|
||||
|
||||
# Contributing
|
||||
|
||||
At this moment, we do not accept contributions. Follow us.
|
||||
|
|
26
tz/hash.go
26
tz/hash.go
|
@ -21,7 +21,7 @@ type digest struct {
|
|||
}
|
||||
|
||||
// type assertion
|
||||
var _ hash.Hash = new(digest)
|
||||
var _ hash.Hash = (*digest)(nil)
|
||||
|
||||
var (
|
||||
minmax = [2]gf127.GF127{{0, 0}, {math.MaxUint64, math.MaxUint64}}
|
||||
|
@ -47,11 +47,10 @@ func (d *digest) checkSum() [hashSize]byte {
|
|||
}
|
||||
|
||||
func (d *digest) byteArray() (b [hashSize]byte) {
|
||||
var t []byte
|
||||
for i := 0; i < 4; i++ {
|
||||
t = d.x[i].ByteArray()
|
||||
copy(b[i*16:], t)
|
||||
}
|
||||
copy(b[:], d.x[0].ByteArray())
|
||||
copy(b[16:], d.x[1].ByteArray())
|
||||
copy(b[32:], d.x[2].ByteArray())
|
||||
copy(b[48:], d.x[3].ByteArray())
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -85,11 +84,20 @@ func (d *digest) BlockSize() int {
|
|||
return hashBlockSize
|
||||
}
|
||||
|
||||
// Sum returns Tillich-Zémor checksum of data
|
||||
func Sum(data []byte) [hashSize]byte {
|
||||
// Sum returns Tillich-Zémor checksum of data.
|
||||
// It uses only AVX instructions (no AVX2).
|
||||
func SumAVX(data []byte) [hashSize]byte {
|
||||
d := new(digest)
|
||||
d.Reset()
|
||||
d.Write(data)
|
||||
_, _ = d.Write(data) // no errors
|
||||
return d.checkSum()
|
||||
}
|
||||
|
||||
// Sum returns Tillich-Zémor checksum of data.
|
||||
func Sum(data []byte) [hashSize]byte {
|
||||
d := new(digest2)
|
||||
d.Reset()
|
||||
_, _ = d.Write(data) // no errors
|
||||
return d.checkSum()
|
||||
}
|
||||
|
||||
|
|
55
tz/hash_avx2.go
Normal file
55
tz/hash_avx2.go
Normal file
|
@ -0,0 +1,55 @@
|
|||
package tz
|
||||
|
||||
import (
|
||||
"hash"
|
||||
|
||||
"github.com/nspcc-dev/tzhash/gf127"
|
||||
)
|
||||
|
||||
type digest2 struct {
|
||||
x [2]gf127.GF127x2
|
||||
}
|
||||
|
||||
var _ hash.Hash = (*digest2)(nil)
|
||||
|
||||
func (d *digest2) Write(data []byte) (n int, err error) {
|
||||
n = len(data)
|
||||
for _, b := range data {
|
||||
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>7)&1])
|
||||
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>6)&1])
|
||||
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>5)&1])
|
||||
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>4)&1])
|
||||
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>3)&1])
|
||||
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>2)&1])
|
||||
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>1)&1])
|
||||
mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>0)&1])
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func (d *digest2) Sum(in []byte) []byte {
|
||||
// Make a copy of d so that caller can keep writing and summing.
|
||||
d0 := *d
|
||||
h := d0.checkSum()
|
||||
return append(in, h[:]...)
|
||||
}
|
||||
func (d *digest2) Reset() {
|
||||
d.x[0] = gf127.GF127x2{1, 0, 0, 0}
|
||||
d.x[1] = gf127.GF127x2{0, 0, 1, 0}
|
||||
}
|
||||
func (d *digest2) Size() int { return hashSize }
|
||||
func (d *digest2) BlockSize() int { return hashBlockSize }
|
||||
func (d *digest2) checkSum() (b [hashSize]byte) {
|
||||
// Matrix is stored transposed,
|
||||
// but we need to use order consistent with digest.
|
||||
h := d.x[0].ByteArray()
|
||||
copy(b[:], h[:16])
|
||||
copy(b[32:], h[16:])
|
||||
|
||||
h = d.x[1].ByteArray()
|
||||
copy(b[16:], h[:16])
|
||||
copy(b[48:], h[16:])
|
||||
return
|
||||
}
|
||||
|
||||
func mulBitRightx2(c00c10 *gf127.GF127x2, c01c11 *gf127.GF127x2, e *gf127.GF127)
|
|
@ -2,13 +2,95 @@ package tz
|
|||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"io"
|
||||
"math/rand"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
const benchDataSize = 100000
|
||||
|
||||
var testCases = []struct {
|
||||
input []byte
|
||||
hash string
|
||||
}{
|
||||
{
|
||||
[]byte{},
|
||||
"00000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
|
||||
},
|
||||
{
|
||||
[]byte{0, 1, 2, 3, 4, 5, 6, 7, 8},
|
||||
"00000000000001e4a545e5b90fb6882b00000000000000c849cd88f79307f67100000000000000cd0c898cb68356e624000000000000007cbcdc7c5e89b16e4b",
|
||||
},
|
||||
{
|
||||
[]byte{4, 8, 15, 16, 23, 42, 255, 0, 127, 65, 32, 123, 42, 45, 201, 210, 213, 244},
|
||||
"4db8a8e253903c70ab0efb65fe6de05a36d1dc9f567a147152d0148a86817b2062908d9b026a506007c1118e86901b672a39317c55ee3c10ac8efafa79efe8ee",
|
||||
},
|
||||
}
|
||||
|
||||
func TestHash(t *testing.T) {
|
||||
t.Run("test AVX digest", func(t *testing.T) {
|
||||
d := new(digest)
|
||||
for _, tc := range testCases {
|
||||
d.Reset()
|
||||
_, _ = d.Write(tc.input)
|
||||
sum := d.checkSum()
|
||||
|
||||
require.Equal(t, tc.hash, hex.EncodeToString(sum[:]))
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("test AVX2 digest", func(t *testing.T) {
|
||||
d := new(digest2)
|
||||
for _, tc := range testCases {
|
||||
d.Reset()
|
||||
_, _ = d.Write(tc.input)
|
||||
sum := d.checkSum()
|
||||
|
||||
require.Equal(t, tc.hash, hex.EncodeToString(sum[:]))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func newBuffer() (data []byte) {
|
||||
data = make([]byte, benchDataSize)
|
||||
|
||||
r := rand.New(rand.NewSource(0))
|
||||
_, err := io.ReadFull(r, data)
|
||||
if err != nil {
|
||||
panic("cant initialize buffer")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func BenchmarkAVX(b *testing.B) {
|
||||
data := newBuffer()
|
||||
|
||||
b.ResetTimer()
|
||||
b.ReportAllocs()
|
||||
d := new(digest)
|
||||
for i := 0; i < b.N; i++ {
|
||||
d.Reset()
|
||||
_, _ = d.Write(data)
|
||||
d.checkSum()
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkAVX2(b *testing.B) {
|
||||
data := newBuffer()
|
||||
|
||||
b.ResetTimer()
|
||||
b.ReportAllocs()
|
||||
d := new(digest2)
|
||||
for i := 0; i < b.N; i++ {
|
||||
d.Reset()
|
||||
_, _ = d.Write(data)
|
||||
d.checkSum()
|
||||
}
|
||||
}
|
||||
|
||||
func TestHomomorphism(t *testing.T) {
|
||||
var (
|
||||
c1, c2 sl2
|
||||
n int
|
||||
|
@ -36,7 +118,7 @@ func TestHash(t *testing.T) {
|
|||
require.Equal(t, h, c1.ByteArray())
|
||||
}
|
||||
|
||||
var testCases = []struct {
|
||||
var testCasesConcat = []struct {
|
||||
Hash string
|
||||
Parts []string
|
||||
}{{
|
||||
|
@ -62,7 +144,7 @@ func TestConcat(t *testing.T) {
|
|||
err error
|
||||
)
|
||||
|
||||
for _, tc := range testCases {
|
||||
for _, tc := range testCasesConcat {
|
||||
expect, err = hex.DecodeString(tc.Hash)
|
||||
require.NoError(t, err)
|
||||
|
||||
|
@ -86,7 +168,7 @@ func TestValidate(t *testing.T) {
|
|||
err error
|
||||
)
|
||||
|
||||
for _, tc := range testCases {
|
||||
for _, tc := range testCasesConcat {
|
||||
hash, _ = hex.DecodeString(tc.Hash)
|
||||
require.NoError(t, err)
|
||||
|
||||
|
|
|
@ -60,3 +60,30 @@ TEXT ·mulBitRight(SB),NOSPLIT,$0
|
|||
XORPD X9, X3
|
||||
MOVUPD X3, (DX)
|
||||
RET
|
||||
|
||||
|
||||
// func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64)
|
||||
TEXT ·mulBitRightx2(SB),NOSPLIT,$0
|
||||
MOVQ c00c10+0(FP), AX
|
||||
VMOVDQA (AX), Y0
|
||||
MOVQ c01c11+8(FP), BX
|
||||
VMOVDQA (BX), Y8
|
||||
|
||||
VPSLLQ $1, Y0, Y1
|
||||
VPALIGNR $8, Y1, Y0, Y2
|
||||
VPSRLQ $63, Y2, Y2
|
||||
VPXOR Y1, Y2, Y2
|
||||
VPSRLQ $63, Y1, Y3
|
||||
VPSLLQ $63, Y3, Y3
|
||||
VPUNPCKHQDQ Y3, Y3, Y3
|
||||
VPXOR Y2, Y3, Y3
|
||||
|
||||
MOVQ e+16(FP), CX
|
||||
VBROADCASTI128 (CX), Y2
|
||||
|
||||
VPXOR Y3, Y8, Y3
|
||||
VPAND Y3, Y2, Y4
|
||||
VPXOR Y4, Y0, Y8
|
||||
VMOVDQA Y8, (BX)
|
||||
VMOVDQA Y3, (AX)
|
||||
RET
|
||||
|
|
Loading…
Reference in a new issue