From 3191f1b3fdf46a26c5e14f544b08a7c03a74e2cf Mon Sep 17 00:00:00 2001 From: Evgenii Stratonikov Date: Tue, 15 Oct 2019 14:37:59 +0300 Subject: [PATCH 1/4] Add AVX implementation with inlined multiplication Perform multiplication by-byte instead of by-bit as in AVX2Inline implementation. --- tz/avx_amd64.s | 91 ++++++++++++++++++++++++++++++++++-------------- tz/avx_inline.go | 65 ++++++++++++++++++++++++++++++++++ tz/hash.go | 5 +++ tz/hash_test.go | 4 +++ 4 files changed, 138 insertions(+), 27 deletions(-) create mode 100644 tz/avx_inline.go diff --git a/tz/avx_amd64.s b/tz/avx_amd64.s index 97e41b9..a380b3a 100644 --- a/tz/avx_amd64.s +++ b/tz/avx_amd64.s @@ -12,6 +12,30 @@ XORPD R2, TO \ XORPD R3, TO +#define mask(bit, src, tmp, to1, to2) \ + MOVQ src, tmp \ + SHRQ bit, tmp \ + ANDQ $1, tmp \ + NEGQ tmp \ + MOVQ tmp, to1 \ + VSHUFPS $0, to1, to1, to2 + // VPBROADCASTB to1, to2 + // Can't use VPBROADCASTB because it is AVX2 instruction + //https://software.intel.com/en-us/forums/intel-isa-extensions/topic/301461 + +#define mulBit(bit) \ + MOVUPD X0, X8 \ + MOVUPD X2, X9 \ + mul2(X0, X5, X6, X7) \ + VXORPD X1, X5, X0 \ + mul2(X2, X5, X6, X7) \ + VXORPD X3, X5, X2 \ + mask(bit, CX, DX, X6, X5) \ + VANDPD X0, X5, X1 \ + XORPD X8, X1 \ + VANDPD X2, X5, X3 \ + XORPD X9, X3 + // func mulBitRight(c00, c01, c10, c11, e *[2]uint64) TEXT ·mulBitRight(SB),NOSPLIT,$0 MOVQ c00+0(FP), AX @@ -25,40 +49,53 @@ TEXT ·mulBitRight(SB),NOSPLIT,$0 MOVQ c11+24(FP), DX MOVUPD (DX), X3 - // c00 *= 2 - mul2(X0, X5, X6, X7) - MOVUPD X5, X0 + mul2(X0, X5, X6, X7) // c00 *= 2 + VXORPD X5, X1, X0 // c00 += c01 + mul2(X2, X5, X6, X7) // c10 *= 2 + VXORPD X3, X5, X2 // c10 += c11 + MOVQ e+32(FP), CX + MOVUPD (CX), X5 + VANDPD X0, X5, X1 // c01 = c00 + e + XORPD X8, X1 // c01 += X8 (old c00) + VANDPD X2, X5, X3 // c11 = c10 + e + XORPD X9, X3 // c11 += x9 (old c10) - // c00 += c01 - XORPD X1, X0 MOVUPD X0, (AX) - - // c10 *= 2 - mul2(X2, X5, X6, X7) - MOVUPD X5, X2 - - // c10 += c11 - XORPD X3, 
X2 + MOVQ c10+16(FP), CX MOVUPD X2, (CX) - - MOVQ e+32(FP), AX - MOVUPD (AX), X5 - - // c01 = c00 + e - VANDPD X0, X5, X1 - - // c01 += X8 (old c00) - XORPD X8, X1 MOVUPD X1, (BX) - - // c11 = c10 + e - VANDPD X2, X5, X3 - - // c11 += X9 (old c10) - XORPD X9, X3 MOVUPD X3, (DX) + RET +TEXT ·mulByteRight(SB),NOSPLIT,$0 + MOVQ c00+0(FP), AX + MOVUPD (AX), X0 + MOVQ c01+8(FP), BX + MOVUPD (BX), X1 + MOVQ c10+16(FP), CX + MOVUPD (CX), X2 + MOVQ c11+24(FP), DX + MOVUPD (DX), X3 + MOVB b+32(FP), CX + + mulBit($7) + mulBit($6) + mulBit($5) + mulBit($4) + mulBit($3) + mulBit($2) + mulBit($1) + mulBit($0) + + MOVUPD X0, (AX) + MOVQ c10+16(FP), CX + MOVUPD X2, (CX) + MOVUPD X1, (BX) + MOVQ c11+24(FP), DX + MOVUPD X3, (DX) + + RET // func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64) TEXT ·mulBitRightx2(SB),NOSPLIT,$0 diff --git a/tz/avx_inline.go b/tz/avx_inline.go new file mode 100644 index 0000000..387b256 --- /dev/null +++ b/tz/avx_inline.go @@ -0,0 +1,65 @@ +// Copyright 2018 (c) NSPCC +// +// This file contains AVX implementation. +package tz + +import ( + "hash" +) + +type digest4 struct { + x [4]GF127 +} + +// type assertion +var _ hash.Hash = (*digest4)(nil) + +func newAVXInline() *digest4 { + d := new(digest4) + d.Reset() + return d +} + +func (d *digest4) Sum(in []byte) []byte { + // Make a copy of d so that caller can keep writing and summing. + d0 := *d + h := d0.checkSum() + return append(in, h[:]...) 
+} + +func (d *digest4) checkSum() [hashSize]byte { + return d.byteArray() +} + +func (d *digest4) byteArray() (b [hashSize]byte) { + copy(b[:], d.x[0].ByteArray()) + copy(b[16:], d.x[1].ByteArray()) + copy(b[32:], d.x[2].ByteArray()) + copy(b[48:], d.x[3].ByteArray()) + return +} + +func (d *digest4) Reset() { + d.x[0] = GF127{1, 0} + d.x[1] = GF127{0, 0} + d.x[2] = GF127{0, 0} + d.x[3] = GF127{1, 0} +} + +func (d *digest4) Write(data []byte) (n int, err error) { + n = len(data) + for _, b := range data { + mulByteRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b) + } + return +} + +func (d *digest4) Size() int { + return hashSize +} + +func (d *digest4) BlockSize() int { + return hashBlockSize +} + +func mulByteRight(c00, c01, c10, c11 *GF127, b byte) diff --git a/tz/hash.go b/tz/hash.go index 25eb2fa..f2cec1e 100644 --- a/tz/hash.go +++ b/tz/hash.go @@ -21,6 +21,7 @@ const ( AVX2 AVX2Inline PureGo + AVXInline ) var ( @@ -34,6 +35,8 @@ func (impl Implementation) String() string { switch impl { case AVX: return "AVX" + case AVXInline: + return "AVXInline" case AVX2: return "AVX2" case AVX2Inline: @@ -49,6 +52,8 @@ func NewWith(impl Implementation) hash.Hash { switch impl { case AVX: return newAVX() + case AVXInline: + return newAVXInline() case AVX2: return newAVX2() case AVX2Inline: diff --git a/tz/hash_test.go b/tz/hash_test.go index 6574f66..8dd7214 100644 --- a/tz/hash_test.go +++ b/tz/hash_test.go @@ -13,6 +13,7 @@ const benchDataSize = 100000 var providers = []Implementation{ AVX, + AVXInline, AVX2, AVX2Inline, PureGo, @@ -22,6 +23,9 @@ func TestNewWith(t *testing.T) { d := NewWith(AVX) require.IsType(t, (*digest)(nil), d) + d = NewWith(AVXInline) + require.IsType(t, (*digest4)(nil), d) + d = NewWith(AVX2) require.IsType(t, (*digest2)(nil), d) From 4b7f39cd1dcfaa75b62d0eb09087d89a7ddc9fe0 Mon Sep 17 00:00:00 2001 From: Evgenii Stratonikov Date: Wed, 16 Oct 2019 14:39:35 +0300 Subject: [PATCH 2/4] Move mulBitRightx2 to avx2 assembly file --- tz/{avx2_inline_amd64.s 
=> avx2_amd64.s} | 26 ++++++++++++++++++++++++ tz/avx_amd64.s | 26 ------------------------ 2 files changed, 26 insertions(+), 26 deletions(-) rename tz/{avx2_inline_amd64.s => avx2_amd64.s} (62%) diff --git a/tz/avx2_inline_amd64.s b/tz/avx2_amd64.s similarity index 62% rename from tz/avx2_inline_amd64.s rename to tz/avx2_amd64.s index fb7f83a..200163c 100644 --- a/tz/avx2_inline_amd64.s +++ b/tz/avx2_amd64.s @@ -44,3 +44,29 @@ TEXT ·mulByteRightx2(SB),NOSPLIT,$0 VMOVDQA Y0, (AX) RET + +// func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64) +TEXT ·mulBitRightx2(SB),NOSPLIT,$0 + MOVQ c00c10+0(FP), AX + VMOVDQA (AX), Y0 + MOVQ c01c11+8(FP), BX + VMOVDQA (BX), Y8 + + VPSLLQ $1, Y0, Y1 + VPALIGNR $8, Y1, Y0, Y2 + VPSRLQ $63, Y2, Y2 + VPXOR Y1, Y2, Y2 + VPSRLQ $63, Y1, Y3 + VPSLLQ $63, Y3, Y3 + VPUNPCKHQDQ Y3, Y3, Y3 + VPXOR Y2, Y3, Y3 + + MOVQ e+16(FP), CX + VBROADCASTI128 (CX), Y2 + + VPXOR Y3, Y8, Y3 + VPAND Y3, Y2, Y4 + VPXOR Y4, Y0, Y8 + VMOVDQA Y8, (BX) + VMOVDQA Y3, (AX) + RET diff --git a/tz/avx_amd64.s b/tz/avx_amd64.s index a380b3a..d8b5e53 100644 --- a/tz/avx_amd64.s +++ b/tz/avx_amd64.s @@ -96,29 +96,3 @@ TEXT ·mulByteRight(SB),NOSPLIT,$0 MOVUPD X3, (DX) RET - -// func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64) -TEXT ·mulBitRightx2(SB),NOSPLIT,$0 - MOVQ c00c10+0(FP), AX - VMOVDQA (AX), Y0 - MOVQ c01c11+8(FP), BX - VMOVDQA (BX), Y8 - - VPSLLQ $1, Y0, Y1 - VPALIGNR $8, Y1, Y0, Y2 - VPSRLQ $63, Y2, Y2 - VPXOR Y1, Y2, Y2 - VPSRLQ $63, Y1, Y3 - VPSLLQ $63, Y3, Y3 - VPUNPCKHQDQ Y3, Y3, Y3 - VPXOR Y2, Y3, Y3 - - MOVQ e+16(FP), CX - VBROADCASTI128 (CX), Y2 - - VPXOR Y3, Y8, Y3 - VPAND Y3, Y2, Y4 - VPXOR Y4, Y0, Y8 - VMOVDQA Y8, (BX) - VMOVDQA Y3, (AX) - RET From 5f74bbc9793594ae8a4ba705e3acd46f9466cf76 Mon Sep 17 00:00:00 2001 From: Evgenii Stratonikov Date: Wed, 16 Oct 2019 14:50:31 +0300 Subject: [PATCH 3/4] Update benchmark result in README.md Also simplify test's and benchmark's names. 
--- README.md | 10 ++++++---- tz/hash_test.go | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 97c0385..055fdaf 100644 --- a/README.md +++ b/README.md @@ -28,12 +28,14 @@ The example of how it works can be seen in tests. # Benchmarks -## AVX vs AVX2 version +## Go vs AVX vs AVX2 version ``` -BenchmarkAVX-8 500 3492019 ns/op 28.64 MB/s 64 B/op 4 allocs/op -BenchmarkAVX2-8 500 2752693 ns/op 36.33 MB/s 64 B/op 2 allocs/op -BenchmarkAVX2Inline-8 1000 1877260 ns/op 53.27 MB/s 64 B/op 2 allocs/op +BenchmarkSum/AVX_digest-8 308 3889484 ns/op 25.71 MB/s 5 allocs/op +BenchmarkSum/AVXInline_digest-8 457 2455437 ns/op 40.73 MB/s 5 allocs/op +BenchmarkSum/AVX2_digest-8 399 3031102 ns/op 32.99 MB/s 3 allocs/op +BenchmarkSum/AVX2Inline_digest-8 602 2077719 ns/op 48.13 MB/s 3 allocs/op +BenchmarkSum/PureGo_digest-8 68 17795480 ns/op 5.62 MB/s 5 allocs/op ``` # Contributing diff --git a/tz/hash_test.go b/tz/hash_test.go index 8dd7214..43ce8a9 100644 --- a/tz/hash_test.go +++ b/tz/hash_test.go @@ -57,7 +57,7 @@ var testCases = []struct { func TestHash(t *testing.T) { for i := range providers { p := providers[i] - t.Run("test "+p.String()+" digest", func(t *testing.T) { + t.Run(p.String()+" digest", func(t *testing.T) { d := NewWith(p) for _, tc := range testCases { d.Reset() @@ -86,7 +86,7 @@ func BenchmarkSum(b *testing.B) { for i := range providers { p := providers[i] - b.Run("bench"+p.String()+"digest", func(b *testing.B) { + b.Run(p.String()+" digest", func(b *testing.B) { b.ResetTimer() b.ReportAllocs() d := NewWith(p) From a8357fda0e5f185c45f640b75a702d35a8dd433f Mon Sep 17 00:00:00 2001 From: Evgenii Stratonikov Date: Wed, 16 Oct 2019 14:52:16 +0300 Subject: [PATCH 4/4] Change default AVX implementation --- tz/hash.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tz/hash.go b/tz/hash.go index f2cec1e..fd43709 100644 --- a/tz/hash.go +++ b/tz/hash.go @@ -70,7 +70,7 @@ func New() hash.Hash { if hasAVX2 { 
return newAVX2Inline() } else if hasAVX { - return newAVX() + return newAVXInline() } else { return newPure() } @@ -83,7 +83,7 @@ func Sum(data []byte) [hashSize]byte { _, _ = d.Write(data) // no errors return d.checkSum() } else if hasAVX { - d := newAVX() + d := newAVXInline() _, _ = d.Write(data) // no errors return d.checkSum() } else {