diff --git a/tz/avx.go b/tz/avx.go
index 4a61702..4b025b2 100644
--- a/tz/avx.go
+++ b/tz/avx.go
@@ -57,14 +57,8 @@ func (d *digest) Reset() {
 func (d *digest) Write(data []byte) (n int, err error) {
 	n = len(data)
 	for _, b := range data {
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>7)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>6)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>5)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>4)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>3)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>2)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>1)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>0)&1])
+		// One assembly call per byte; it consumes bits 7 down to 0 of b.
+		mulByteRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b)
 	}
 	return
 }
@@ -78,3 +72,7 @@ func (d *digest) BlockSize() int {
 }
 
 func mulBitRight(c00, c01, c10, c11, e *gf127.GF127)
+
+// mulByteRight is implemented in avx_amd64.s. It runs the mulBit step on
+// the matrix (c00 c01; c10 c11) once per bit of b, from bit 7 down to bit 0.
+func mulByteRight(c00, c01, c10, c11 *gf127.GF127, b byte)
diff --git a/tz/avx_amd64.s b/tz/avx_amd64.s
index 97e41b9..deb8ec1 100644
--- a/tz/avx_amd64.s
+++ b/tz/avx_amd64.s
@@ -12,6 +12,74 @@
     XORPD R2, TO \
     XORPD R3, TO
 
+
+// mask fills to2 with ones when bit number `bit` of src is set and with
+// zeros otherwise. tmp (general purpose) and to1 (XMM) are scratch.
+// PSHUFD is SSE2, so this no longer needs AVX2 as VPBROADCASTB did.
+#define mask(bit, src, tmp, to1, to2) \
+    MOVQ src, tmp \
+    SHRQ bit, tmp \
+    ANDQ $1, tmp \
+    NEGQ tmp \
+    MOVQ tmp, to1 \
+    PSHUFD $0, to1, to2
+
+// mulBit performs one step on the matrix kept in X0 (c00), X1 (c01),
+// X2 (c10) and X3 (c11), driven by bit number `bit` of CX:
+//   c00' = mul2(c00) XOR c01,  c01' = (c00' AND m) XOR c00
+//   c10' = mul2(c10) XOR c11,  c11' = (c10' AND m) XOR c10
+// where m is the all-ones/all-zeros value produced by mask.
+// Overwrites DX, X5, X6, X8, X9; X6 and X7 are also handed to mul2.
+#define mulBit(bit) \
+    MOVUPD X0, X8 \
+    MOVUPD X2, X9 \
+    mul2(X0, X5, X6, X7) \
+    VXORPD X1, X5, X0 \
+    mul2(X2, X5, X6, X7) \
+    VXORPD X3, X5, X2 \
+    mask(bit, CX, DX, X6, X5) \
+    VANDPD X0, X5, X1 \
+    XORPD X8, X1 \
+    VANDPD X2, X5, X3 \
+    XORPD X9, X3
+
+
+// func mulByteRight(c00, c01, c10, c11 *[2]uint64, b byte)
+//
+// Applies mulBit for bits 7 down to 0 of b and stores the updated
+// c00, c01, c10 and c11 back through their pointers.
+TEXT ·mulByteRight(SB),NOSPLIT,$0
+    MOVQ c00+0(FP), AX
+    MOVUPD (AX), X0
+    MOVQ c01+8(FP), BX
+    MOVUPD (BX), X1
+    MOVQ c10+16(FP), CX
+    MOVUPD (CX), X2
+    MOVQ c11+24(FP), DX
+    MOVUPD (DX), X3
+    // CX now carries b for mulBit; the c10 pointer is reloaded below.
+    MOVB b+32(FP), CX
+
+    mulBit($7)
+    mulBit($6)
+    mulBit($5)
+    mulBit($4)
+    mulBit($3)
+    mulBit($2)
+    mulBit($1)
+    mulBit($0)
+
+    MOVUPD X0, (AX)
+    MOVQ c10+16(FP), CX
+    MOVUPD X2, (CX)
+    MOVUPD X1, (BX)
+    // mask used DX as scratch, so fetch the c11 pointer again.
+    MOVQ c11+24(FP), DX
+    MOVUPD X3, (DX)
+
+    RET
+
+
 // func mulBitRight(c00, c01, c10, c11, e *[2]uint64)
 TEXT ·mulBitRight(SB),NOSPLIT,$0
     MOVQ c00+0(FP), AX
@@ -25,40 +93,29 @@ TEXT ·mulBitRight(SB),NOSPLIT,$0
     MOVQ c11+24(FP), DX
     MOVUPD (DX), X3
 
-    // c00 *= 2
+    // c00 = mul2(c00) XOR c01
     mul2(X0, X5, X6, X7)
-    MOVUPD X5, X0
-
-    // c00 += c01
-    XORPD X1, X0
-    MOVUPD X0, (AX)
-
-    // c10 *= 2
+    VXORPD X1, X5, X0
+    // c10 = mul2(c10) XOR c11
     mul2(X2, X5, X6, X7)
-    MOVUPD X5, X2
-
-    // c10 += c11
-    XORPD X3, X2
-    MOVUPD X2, (CX)
-
-    MOVQ e+32(FP), AX
-    MOVUPD (AX), X5
-
-    // c01 = c00 + e
+    VXORPD X3, X5, X2
+    // CX is reused for e; the c10 pointer is reloaded before the stores.
+    MOVQ e+32(FP), CX
+    MOVUPD (CX), X5
+    // c01 = (c00 AND e) XOR X8 (old c00)
     VANDPD X0, X5, X1
-
-    // c01 += X8 (old c00)
     XORPD X8, X1
-    MOVUPD X1, (BX)
-
-    // c11 = c10 + e
+    // c11 = (c10 AND e) XOR X9 (old c10)
     VANDPD X2, X5, X3
-
-    // c11 += X9 (old c10)
     XORPD X9, X3
-    MOVUPD X3, (DX)
 
-    RET
+    MOVUPD X0, (AX)
+    MOVQ c10+16(FP), CX
+    MOVUPD X2, (CX)
+    MOVUPD X1, (BX)
+    MOVUPD X3, (DX)
+
+    RET
 
 // func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64)
 TEXT ·mulBitRightx2(SB),NOSPLIT,$0