From 3191f1b3fdf46a26c5e14f544b08a7c03a74e2cf Mon Sep 17 00:00:00 2001 From: Evgenii Stratonikov Date: Tue, 15 Oct 2019 14:37:59 +0300 Subject: [PATCH] Add AVX implementation with inlined multiplication Perform multiplication by-byte instead of by-bit as in AVX2Inline implementation. --- tz/avx_amd64.s | 91 ++++++++++++++++++++++++++++++++++-------------- tz/avx_inline.go | 65 ++++++++++++++++++++++++++++++++++ tz/hash.go | 5 +++ tz/hash_test.go | 4 +++ 4 files changed, 138 insertions(+), 27 deletions(-) create mode 100644 tz/avx_inline.go diff --git a/tz/avx_amd64.s b/tz/avx_amd64.s index 97e41b9..a380b3a 100644 --- a/tz/avx_amd64.s +++ b/tz/avx_amd64.s @@ -12,6 +12,30 @@ XORPD R2, TO \ XORPD R3, TO +#define mask(bit, src, tmp, to1, to2) \ + MOVQ src, tmp \ + SHRQ bit, tmp \ + ANDQ $1, tmp \ + NEGQ tmp \ + MOVQ tmp, to1 \ + VSHUFPS $0, to1, to1, to2 + // VPBROADCASTB to1, to2 + // Can't use VPBROADCASTB because it is an AVX2 instruction + // https://software.intel.com/en-us/forums/intel-isa-extensions/topic/301461 + +#define mulBit(bit) \ + MOVUPD X0, X8 \ + MOVUPD X2, X9 \ + mul2(X0, X5, X6, X7) \ + VXORPD X1, X5, X0 \ + mul2(X2, X5, X6, X7) \ + VXORPD X3, X5, X2 \ + mask(bit, CX, DX, X6, X5) \ + VANDPD X0, X5, X1 \ + XORPD X8, X1 \ + VANDPD X2, X5, X3 \ + XORPD X9, X3 + // func mulBitRight(c00, c01, c10, c11, e *[2]uint64) TEXT ·mulBitRight(SB),NOSPLIT,$0 MOVQ c00+0(FP), AX @@ -25,40 +49,53 @@ TEXT ·mulBitRight(SB),NOSPLIT,$0 MOVQ c11+24(FP), DX MOVUPD (DX), X3 - // c00 *= 2 - mul2(X0, X5, X6, X7) - MOVUPD X5, X0 + mul2(X0, X5, X6, X7) // c00 *= 2 + VXORPD X5, X1, X0 // c00 += c01 + mul2(X2, X5, X6, X7) // c10 *= 2 + VXORPD X3, X5, X2 // c10 += c11 + MOVQ e+32(FP), CX + MOVUPD (CX), X5 + VANDPD X0, X5, X1 // c01 = c00 & e + XORPD X8, X1 // c01 += X8 (old c00) + VANDPD X2, X5, X3 // c11 = c10 & e + XORPD X9, X3 // c11 += X9 (old c10) - // c00 += c01 - XORPD X1, X0 MOVUPD X0, (AX) - - // c10 *= 2 - mul2(X2, X5, X6, X7) - MOVUPD X5, X2 - - // c10 += c11 - XORPD X3, X2 
+ MOVQ c10+16(FP), CX MOVUPD X2, (CX) - - MOVQ e+32(FP), AX - MOVUPD (AX), X5 - - // c01 = c00 + e - VANDPD X0, X5, X1 - - // c01 += X8 (old c00) - XORPD X8, X1 MOVUPD X1, (BX) - - // c11 = c10 + e - VANDPD X2, X5, X3 - - // c11 += X9 (old c10) - XORPD X9, X3 MOVUPD X3, (DX) + RET +TEXT ·mulByteRight(SB),NOSPLIT,$0 + MOVQ c00+0(FP), AX + MOVUPD (AX), X0 + MOVQ c01+8(FP), BX + MOVUPD (BX), X1 + MOVQ c10+16(FP), CX + MOVUPD (CX), X2 + MOVQ c11+24(FP), DX + MOVUPD (DX), X3 + MOVB b+32(FP), CX + + mulBit($7) + mulBit($6) + mulBit($5) + mulBit($4) + mulBit($3) + mulBit($2) + mulBit($1) + mulBit($0) + + MOVUPD X0, (AX) + MOVQ c10+16(FP), CX + MOVUPD X2, (CX) + MOVUPD X1, (BX) + MOVQ c11+24(FP), DX + MOVUPD X3, (DX) + + RET // func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64) TEXT ·mulBitRightx2(SB),NOSPLIT,$0 diff --git a/tz/avx_inline.go b/tz/avx_inline.go new file mode 100644 index 0000000..387b256 --- /dev/null +++ b/tz/avx_inline.go @@ -0,0 +1,65 @@ +// Copyright 2018 (c) NSPCC +// +// This file contains AVX implementation. +package tz + +import ( + "hash" +) + +type digest4 struct { + x [4]GF127 +} + +// type assertion +var _ hash.Hash = (*digest4)(nil) + +func newAVXInline() *digest4 { + d := new(digest4) + d.Reset() + return d +} + +func (d *digest4) Sum(in []byte) []byte { + // Make a copy of d so that caller can keep writing and summing. + d0 := *d + h := d0.checkSum() + return append(in, h[:]...) 
+} + +func (d *digest4) checkSum() [hashSize]byte { + return d.byteArray() +} + +func (d *digest4) byteArray() (b [hashSize]byte) { + copy(b[:], d.x[0].ByteArray()) + copy(b[16:], d.x[1].ByteArray()) + copy(b[32:], d.x[2].ByteArray()) + copy(b[48:], d.x[3].ByteArray()) + return +} + +func (d *digest4) Reset() { + d.x[0] = GF127{1, 0} + d.x[1] = GF127{0, 0} + d.x[2] = GF127{0, 0} + d.x[3] = GF127{1, 0} +} + +func (d *digest4) Write(data []byte) (n int, err error) { + n = len(data) + for _, b := range data { + mulByteRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b) + } + return +} + +func (d *digest4) Size() int { + return hashSize +} + +func (d *digest4) BlockSize() int { + return hashBlockSize +} + +func mulByteRight(c00, c01, c10, c11 *GF127, b byte) diff --git a/tz/hash.go b/tz/hash.go index 25eb2fa..f2cec1e 100644 --- a/tz/hash.go +++ b/tz/hash.go @@ -21,6 +21,7 @@ const ( AVX2 AVX2Inline PureGo + AVXInline ) var ( @@ -34,6 +35,8 @@ func (impl Implementation) String() string { switch impl { case AVX: return "AVX" + case AVXInline: + return "AVXInline" case AVX2: return "AVX2" case AVX2Inline: @@ -49,6 +52,8 @@ func NewWith(impl Implementation) hash.Hash { switch impl { case AVX: return newAVX() + case AVXInline: + return newAVXInline() case AVX2: return newAVX2() case AVX2Inline: diff --git a/tz/hash_test.go b/tz/hash_test.go index 6574f66..8dd7214 100644 --- a/tz/hash_test.go +++ b/tz/hash_test.go @@ -13,6 +13,7 @@ const benchDataSize = 100000 var providers = []Implementation{ AVX, + AVXInline, AVX2, AVX2Inline, PureGo, @@ -22,6 +23,9 @@ func TestNewWith(t *testing.T) { d := NewWith(AVX) require.IsType(t, (*digest)(nil), d) + d = NewWith(AVXInline) + require.IsType(t, (*digest4)(nil), d) + d = NewWith(AVX2) require.IsType(t, (*digest2)(nil), d)