From 0e0d28e82f286fd7632299f1ff2f0afd04a1f3de Mon Sep 17 00:00:00 2001
From: Evgenii Stratonikov <evgeniy@nspcc.ru>
Date: Wed, 9 Mar 2022 16:58:21 +0300
Subject: [PATCH] tz: use build tags for different implementations

Signed-off-by: Evgenii Stratonikov <evgeniy@nspcc.ru>
---
 benchmark                                |   7 +-
 cmd/tzsum/main.go                        |  22 ++--
 tz/avx.go                                |  75 --------------
 tz/avx2.go                               |  62 ------------
 tz/avx2_inline.go                        |  55 ----------
 tz/avx_inline.go                         |  62 ------------
 tz/digest.go                             | 122 +++++++++++++++++++++++
 tz/{avx2_amd64.s => digest_avx2_amd64.s} |  26 -----
 tz/{avx_amd64.s => digest_avx_amd64.s}   |  45 +--------
 tz/digest_generic.go                     |   8 ++
 tz/digets_amd64.go                       |  39 ++++++++
 tz/hash.go                               |  87 ----------------
 tz/hash_test.go                          |  69 +++++++------
 tz/pure.go                               |  92 -----------------
 tz/sl2.go                                |  11 +-
 15 files changed, 236 insertions(+), 546 deletions(-)
 delete mode 100644 tz/avx.go
 delete mode 100644 tz/avx2.go
 delete mode 100644 tz/avx2_inline.go
 delete mode 100644 tz/avx_inline.go
 create mode 100644 tz/digest.go
 rename tz/{avx2_amd64.s => digest_avx2_amd64.s} (74%)
 rename tz/{avx_amd64.s => digest_avx_amd64.s} (60%)
 create mode 100644 tz/digest_generic.go
 create mode 100644 tz/digets_amd64.go
 delete mode 100644 tz/pure.go

diff --git a/benchmark b/benchmark
index 926c02a..6d16439 100755
--- a/benchmark
+++ b/benchmark
@@ -3,9 +3,10 @@
 tmpfile=$(mktemp /tmp/random-file.XXXXXX)
 dd if=/dev/urandom of=$tmpfile bs=$1 count=1
 
-go build ./cmd/tzsum && \
-for impl in avx avx2 avx2inline purego; do
+go build ./cmd/tzsum || exit 1
+
+for impl in avx avx2 generic; do
 	echo $impl implementation:
-	/usr/bin/env time ./tzsum -name $tmpfile -impl $impl
+	time ./tzsum -name $tmpfile -impl $impl
 	echo
 done
diff --git a/cmd/tzsum/main.go b/cmd/tzsum/main.go
index 5d6565e..1707e74 100644
--- a/cmd/tzsum/main.go
+++ b/cmd/tzsum/main.go
@@ -11,6 +11,7 @@ import (
 	"runtime/pprof"
 
 	"github.com/nspcc-dev/tzhash/tz"
+	"golang.org/x/sys/cpu"
 )
 
 var (
@@ -46,18 +47,23 @@ func main() {
 		f = os.Stdin
 	}
 
+	// Override CPU feature flags to make sure a proper backend is used.
 	var h hash.Hash
 	switch *hashimpl {
 	case "avx":
-		h = tz.NewWith(tz.AVX)
-	case "avx2":
-		h = tz.NewWith(tz.AVX2)
-	case "avx2inline":
-		h = tz.NewWith(tz.AVX2Inline)
-	case "purego":
-		h = tz.NewWith(tz.PureGo)
-	default:
+		cpu.X86.HasAVX = true
+		cpu.X86.HasAVX2 = false
 		h = tz.New()
+	case "avx2":
+		cpu.X86.HasAVX = true
+		cpu.X86.HasAVX2 = true
+		h = tz.New()
+	case "generic":
+		cpu.X86.HasAVX = false
+		cpu.X86.HasAVX2 = false
+		h = tz.New()
+	default:
+		log.Fatalf("Invalid backend: %s", *hashimpl)
 	}
 
 	if _, err := io.Copy(h, f); err != nil {
diff --git a/tz/avx.go b/tz/avx.go
deleted file mode 100644
index 4e0a2fa..0000000
--- a/tz/avx.go
+++ /dev/null
@@ -1,75 +0,0 @@
-package tz
-
-import (
-	"hash"
-	"math"
-)
-
-type digest struct {
-	x [4]GF127
-}
-
-// type assertion
-var _ hash.Hash = (*digest)(nil)
-
-var (
-	minmax  = [2]GF127{{0, 0}, {math.MaxUint64, math.MaxUint64}}
-	x127x63 = GF127{1 << 63, 1 << 63} //nolint:deadcode,varcheck
-)
-
-func newAVX() *digest {
-	d := new(digest)
-	d.Reset()
-	return d
-}
-
-func (d *digest) Sum(in []byte) []byte {
-	// Make a copy of d so that caller can keep writing and summing.
-	d0 := *d
-	h := d0.checkSum()
-	return append(in, h[:]...)
-}
-
-func (d *digest) checkSum() [Size]byte {
-	return d.byteArray()
-}
-
-func (d *digest) byteArray() (b [Size]byte) {
-	copy(b[:], d.x[0].ByteArray())
-	copy(b[16:], d.x[1].ByteArray())
-	copy(b[32:], d.x[2].ByteArray())
-	copy(b[48:], d.x[3].ByteArray())
-	return
-}
-
-func (d *digest) Reset() {
-	d.x[0] = GF127{1, 0}
-	d.x[1] = GF127{0, 0}
-	d.x[2] = GF127{0, 0}
-	d.x[3] = GF127{1, 0}
-}
-
-func (d *digest) Write(data []byte) (n int, err error) {
-	n = len(data)
-	for _, b := range data {
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>7)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>6)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>5)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>4)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>3)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>2)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>1)&1])
-		mulBitRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], &minmax[(b>>0)&1])
-	}
-	return
-}
-
-func (d *digest) Size() int {
-	return Size
-}
-
-func (d *digest) BlockSize() int {
-	return hashBlockSize
-}
-
-func mulBitRight(c00, c01, c10, c11, e *GF127)
diff --git a/tz/avx2.go b/tz/avx2.go
deleted file mode 100644
index b41c182..0000000
--- a/tz/avx2.go
+++ /dev/null
@@ -1,62 +0,0 @@
-package tz
-
-import (
-	"hash"
-
-	"github.com/nspcc-dev/tzhash/gf127"
-)
-
-type digest2 struct {
-	x [2]gf127.GF127x2
-}
-
-// type assertion
-var _ hash.Hash = (*digest2)(nil)
-
-func newAVX2() *digest2 {
-	d := new(digest2)
-	d.Reset()
-	return d
-}
-
-func (d *digest2) Write(data []byte) (n int, err error) {
-	n = len(data)
-	for _, b := range data {
-		mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>7)&1])
-		mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>6)&1])
-		mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>5)&1])
-		mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>4)&1])
-		mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>3)&1])
-		mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>2)&1])
-		mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>1)&1])
-		mulBitRightx2(&d.x[0], &d.x[1], &minmax[(b>>0)&1])
-	}
-	return
-}
-
-func (d *digest2) Sum(in []byte) []byte {
-	// Make a copy of d so that caller can keep writing and summing.
-	d0 := *d
-	h := d0.checkSum()
-	return append(in, h[:]...)
-}
-func (d *digest2) Reset() {
-	d.x[0] = gf127.GF127x2{GF127{1, 0}, GF127{0, 0}}
-	d.x[1] = gf127.GF127x2{GF127{0, 0}, GF127{1, 0}}
-}
-func (d *digest2) Size() int      { return Size }
-func (d *digest2) BlockSize() int { return hashBlockSize }
-func (d *digest2) checkSum() (b [Size]byte) {
-	// Matrix is stored transposed,
-	// but we need to use order consistent with digest.
-	h := d.x[0].ByteArray()
-	copy(b[:], h[:16])
-	copy(b[32:], h[16:])
-
-	h = d.x[1].ByteArray()
-	copy(b[16:], h[:16])
-	copy(b[48:], h[16:])
-	return
-}
-
-func mulBitRightx2(c00c10 *gf127.GF127x2, c01c11 *gf127.GF127x2, e *GF127)
diff --git a/tz/avx2_inline.go b/tz/avx2_inline.go
deleted file mode 100644
index f5fcecf..0000000
--- a/tz/avx2_inline.go
+++ /dev/null
@@ -1,55 +0,0 @@
-package tz
-
-import (
-	"hash"
-
-	"github.com/nspcc-dev/tzhash/gf127"
-)
-
-type digest3 struct {
-	x [2]gf127.GF127x2
-}
-
-// type assertion
-var _ hash.Hash = (*digest3)(nil)
-
-func newAVX2Inline() *digest3 {
-	d := new(digest3)
-	d.Reset()
-	return d
-}
-
-func (d *digest3) Write(data []byte) (n int, err error) {
-	n = len(data)
-	if len(data) != 0 {
-		mulByteSliceRightx2(&d.x[0], &d.x[1], n, &data[0])
-	}
-	return
-}
-
-func (d *digest3) Sum(in []byte) []byte {
-	// Make a copy of d so that caller can keep writing and summing.
-	d0 := *d
-	h := d0.checkSum()
-	return append(in, h[:]...)
-}
-func (d *digest3) Reset() {
-	d.x[0] = gf127.GF127x2{GF127{1, 0}, GF127{0, 0}}
-	d.x[1] = gf127.GF127x2{GF127{0, 0}, GF127{1, 0}}
-}
-func (d *digest3) Size() int      { return Size }
-func (d *digest3) BlockSize() int { return hashBlockSize }
-func (d *digest3) checkSum() (b [Size]byte) {
-	// Matrix is stored transposed,
-	// but we need to use order consistent with digest.
-	h := d.x[0].ByteArray()
-	copy(b[:], h[:16])
-	copy(b[32:], h[16:])
-
-	h = d.x[1].ByteArray()
-	copy(b[16:], h[:16])
-	copy(b[48:], h[16:])
-	return
-}
-
-func mulByteSliceRightx2(c00c10 *gf127.GF127x2, c01c11 *gf127.GF127x2, n int, data *byte)
diff --git a/tz/avx_inline.go b/tz/avx_inline.go
deleted file mode 100644
index a023be5..0000000
--- a/tz/avx_inline.go
+++ /dev/null
@@ -1,62 +0,0 @@
-package tz
-
-import (
-	"hash"
-)
-
-type digest4 struct {
-	x [4]GF127
-}
-
-// type assertion
-var _ hash.Hash = (*digest4)(nil)
-
-func newAVXInline() *digest4 {
-	d := new(digest4)
-	d.Reset()
-	return d
-}
-
-func (d *digest4) Sum(in []byte) []byte {
-	// Make a copy of d so that caller can keep writing and summing.
-	d0 := *d
-	h := d0.checkSum()
-	return append(in, h[:]...)
-}
-
-func (d *digest4) checkSum() [Size]byte {
-	return d.byteArray()
-}
-
-func (d *digest4) byteArray() (b [Size]byte) {
-	copy(b[:], d.x[0].ByteArray())
-	copy(b[16:], d.x[1].ByteArray())
-	copy(b[32:], d.x[2].ByteArray())
-	copy(b[48:], d.x[3].ByteArray())
-	return
-}
-
-func (d *digest4) Reset() {
-	d.x[0] = GF127{1, 0}
-	d.x[1] = GF127{0, 0}
-	d.x[2] = GF127{0, 0}
-	d.x[3] = GF127{1, 0}
-}
-
-func (d *digest4) Write(data []byte) (n int, err error) {
-	n = len(data)
-	for _, b := range data {
-		mulByteRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b)
-	}
-	return
-}
-
-func (d *digest4) Size() int {
-	return Size
-}
-
-func (d *digest4) BlockSize() int {
-	return hashBlockSize
-}
-
-func mulByteRight(c00, c01, c10, c11 *GF127, b byte)
diff --git a/tz/digest.go b/tz/digest.go
new file mode 100644
index 0000000..7ba074d
--- /dev/null
+++ b/tz/digest.go
@@ -0,0 +1,122 @@
+package tz
+
+import (
+	"github.com/nspcc-dev/tzhash/gf127"
+)
+
+const (
+	// Size is the size of a Tillich-Zémor hash sum in bytes.
+	Size          = 64
+	hashBlockSize = 128
+)
+
+type digest struct {
+	// Stores matrix cells in the following order:
+	// [ 0 2 ]
+	// [ 1 3 ]
+	// This is done to reuse the same digest between generic
+	// and AVX2 implementation.
+	x [4]GF127
+}
+
+// New returns a new hash.Hash computing the Tillich-Zémor checksum.
+func New() *digest {
+	d := new(digest)
+	d.Reset()
+	return d
+}
+
+// Sum returns the Tillich-Zémor checksum of data, computed from a freshly reset digest.
+func Sum(data []byte) [Size]byte {
+	d := New() // New calls Reset; a zero-valued digest is the zero matrix and would stay zero for any input
+	_, _ = d.Write(data) // no errors
+	return d.checkSum()
+}
+
+func (d *digest) Sum(in []byte) []byte {
+	// Make a copy of d so that caller can keep writing and summing.
+	d0 := *d
+	h := d0.checkSum()
+	return append(in, h[:]...)
+}
+
+func (d *digest) checkSum() [Size]byte {
+	return d.byteArray()
+}
+
+func (d *digest) byteArray() (b [Size]byte) {
+	t := d.x[0].ByteArray()
+	copy(b[:], t[:])
+
+	t = d.x[2].ByteArray()
+	copy(b[16:], t[:])
+
+	t = d.x[1].ByteArray()
+	copy(b[32:], t[:])
+
+	t = d.x[3].ByteArray()
+	copy(b[48:], t[:])
+
+	return
+}
+
+func (d *digest) Reset() {
+	d.x[0] = GF127{1, 0}
+	d.x[1] = GF127{0, 0}
+	d.x[2] = GF127{0, 0}
+	d.x[3] = GF127{1, 0}
+}
+
+func (d *digest) Write(data []byte) (n int, err error) {
+	return write(d, data)
+}
+
+func writeGeneric(d *digest, data []byte) (n int, err error) {
+	n = len(data)
+	tmp := new(GF127)
+	for _, b := range data {
+		mulBitRightGeneric(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x80 != 0, tmp)
+		mulBitRightGeneric(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x40 != 0, tmp)
+		mulBitRightGeneric(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x20 != 0, tmp)
+		mulBitRightGeneric(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x10 != 0, tmp)
+		mulBitRightGeneric(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x08 != 0, tmp)
+		mulBitRightGeneric(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x04 != 0, tmp)
+		mulBitRightGeneric(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x02 != 0, tmp)
+		mulBitRightGeneric(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x01 != 0, tmp)
+	}
+	return
+}
+
+func (d *digest) Size() int {
+	return Size
+}
+
+func (d *digest) BlockSize() int {
+	return hashBlockSize
+}
+
+func mulBitRightGeneric(c00, c10, c01, c11 *GF127, bit bool, tmp *GF127) {
+	if bit {
+		*tmp = *c00
+		gf127.Mul10(c00, c00)
+		gf127.Add(c00, c01, c00)
+		gf127.Mul11(tmp, tmp)
+		gf127.Add(c01, tmp, c01)
+
+		*tmp = *c10
+		gf127.Mul10(c10, c10)
+		gf127.Add(c10, c11, c10)
+		gf127.Mul11(tmp, tmp)
+		gf127.Add(c11, tmp, c11)
+	} else {
+		*tmp = *c00
+		gf127.Mul10(c00, c00)
+		gf127.Add(c00, c01, c00)
+		*c01 = *tmp
+
+		*tmp = *c10
+		gf127.Mul10(c10, c10)
+		gf127.Add(c10, c11, c10)
+		*c11 = *tmp
+	}
+}
diff --git a/tz/avx2_amd64.s b/tz/digest_avx2_amd64.s
similarity index 74%
rename from tz/avx2_amd64.s
rename to tz/digest_avx2_amd64.s
index 0a0de51..2d818de 100644
--- a/tz/avx2_amd64.s
+++ b/tz/digest_avx2_amd64.s
@@ -59,29 +59,3 @@ finish:
 	VMOVDQU Y0, (AX)
 
 	RET
-
-// func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64)
-TEXT ·mulBitRightx2(SB), NOSPLIT, $0
-	MOVQ    c00c10+0(FP), AX
-	VMOVDQU (AX), Y0
-	MOVQ    c01c11+8(FP), BX
-	VMOVDQU (BX), Y8
-
-	VPSLLQ      $1, Y0, Y1
-	VPALIGNR    $8, Y1, Y0, Y2
-	VPSRLQ      $63, Y2, Y2
-	VPXOR       Y1, Y2, Y2
-	VPSRLQ      $63, Y1, Y3
-	VPSLLQ      $63, Y3, Y3
-	VPUNPCKHQDQ Y3, Y3, Y3
-	VPXOR       Y2, Y3, Y3
-
-	MOVQ           e+16(FP), CX
-	VBROADCASTI128 (CX), Y2
-
-	VPXOR   Y3, Y8, Y3
-	VPAND   Y3, Y2, Y4
-	VPXOR   Y4, Y0, Y8
-	VMOVDQU Y8, (BX)
-	VMOVDQU Y3, (AX)
-	RET
diff --git a/tz/avx_amd64.s b/tz/digest_avx_amd64.s
similarity index 60%
rename from tz/avx_amd64.s
rename to tz/digest_avx_amd64.s
index 7a56a5c..e040ffd 100644
--- a/tz/avx_amd64.s
+++ b/tz/digest_avx_amd64.s
@@ -29,50 +29,13 @@
 	VANDPD  X2, X5, X3   \
 	VXORPD  X9, X3, X3
 
-// func mulBitRight(c00, c01, c10, c11, e *[2]uint64)
-TEXT ·mulBitRight(SB), NOSPLIT, $0
-	MOVQ    c00+0(FP), AX
-	VMOVDQU (AX), X0
-	VMOVDQU X0, X8         // remember c00 value
-	MOVQ    c01+8(FP), BX
-	VMOVDQU (BX), X1
-	MOVQ    c10+16(FP), CX
-	VMOVDQU (CX), X2
-	VMOVDQU X2, X9         // remember c10 value
-	MOVQ    c11+24(FP), DX
-	VMOVDQU (DX), X3
-
-	VPXOR    X13, X13, X13 // Y13 = 0x0000...
-	VPCMPEQB X14, X14, X14 // Y14 = 0xFFFF...
-	VPSUBQ   X14, X13, X13
-	VPSLLQ   $63, X13, X14
-
-	mul2(X0, X5, X6, X7) // c00 *= 2
-	VXORPD  X5, X1, X0   // c00 += c01
-	mul2(X2, X5, X6, X7) // c10 *= 2
-	VXORPD  X3, X5, X2   // c10 += c11
-	MOVQ    e+32(FP), CX
-	VMOVDQU (CX), X5
-	VANDPD  X0, X5, X1   // c01 = c00 + e
-	VXORPD  X8, X1, X1   // c01 += X8 (old c00)
-	VANDPD  X2, X5, X3   // c11 = c10 + e
-	VXORPD  X9, X3, X3   // c11 += x9 (old c10)
-
-	VMOVDQU X0, (AX)
-	MOVQ    c10+16(FP), CX
-	VMOVDQU X2, (CX)
-	VMOVDQU X1, (BX)
-	VMOVDQU X3, (DX)
-
-	RET
-
 TEXT ·mulByteRight(SB), NOSPLIT, $0
 	MOVQ    c00+0(FP), AX
 	VMOVDQU (AX), X0
-	MOVQ    c01+8(FP), BX
-	VMOVDQU (BX), X1
-	MOVQ    c10+16(FP), CX
+	MOVQ    c10+8(FP), CX
 	VMOVDQU (CX), X2
+	MOVQ    c01+16(FP), BX
+	VMOVDQU (BX), X1
 	MOVQ    c11+24(FP), DX
 	VMOVDQU (DX), X3
 	MOVQ    $0, CX
@@ -98,7 +61,7 @@ TEXT ·mulByteRight(SB), NOSPLIT, $0
 	mulBit($0)
 
 	VMOVDQU X0, (AX)
-	MOVQ    c10+16(FP), CX
+	MOVQ    c10+8(FP), CX
 	VMOVDQU X2, (CX)
 	VMOVDQU X1, (BX)
 	MOVQ    c11+24(FP), DX
diff --git a/tz/digest_generic.go b/tz/digest_generic.go
new file mode 100644
index 0000000..b8778c2
--- /dev/null
+++ b/tz/digest_generic.go
@@ -0,0 +1,8 @@
+//go:build !(amd64 && !generic)
+// +build !amd64 generic
+
+package tz
+
+func write(d *digest, data []byte) (int, error) {
+	return writeGeneric(d, data)
+}
diff --git a/tz/digets_amd64.go b/tz/digets_amd64.go
new file mode 100644
index 0000000..462bfff
--- /dev/null
+++ b/tz/digets_amd64.go
@@ -0,0 +1,39 @@
+//go:build amd64 && !generic
+// +build amd64,!generic
+
+package tz
+
+import (
+	"github.com/nspcc-dev/tzhash/gf127"
+	"golang.org/x/sys/cpu"
+)
+
+func write(d *digest, data []byte) (n int, err error) {
+	switch {
+	case cpu.X86.HasAVX && cpu.X86.HasAVX2:
+		return writeAVX2(d, data)
+	case cpu.X86.HasAVX:
+		return writeAVX(d, data)
+	default:
+		return writeGeneric(d, data)
+	}
+}
+
+func writeAVX2(d *digest, data []byte) (n int, err error) {
+	n = len(data)
+	if len(data) != 0 {
+		mulByteSliceRightx2(&d.x[0], &d.x[2], n, &data[0])
+	}
+	return
+}
+
+func writeAVX(d *digest, data []byte) (n int, err error) {
+	n = len(data)
+	for _, b := range data {
+		mulByteRight(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b)
+	}
+	return
+}
+
+func mulByteRight(c00, c10, c01, c11 *GF127, b byte) // parameter names match the c10+8(FP) / c01+16(FP) offsets used in digest_avx_amd64.s
+func mulByteSliceRightx2(c00c10 *gf127.GF127, c01c11 *gf127.GF127, n int, data *byte)
diff --git a/tz/hash.go b/tz/hash.go
index a7e4e4e..e39ae60 100644
--- a/tz/hash.go
+++ b/tz/hash.go
@@ -6,95 +6,8 @@ package tz
 
 import (
 	"errors"
-	"hash"
-
-	"golang.org/x/sys/cpu"
 )
 
-type Implementation int
-
-const (
-	// Size is the size of a Tillich-Zemor hash sum in bytes.
-	Size          = 64
-	hashBlockSize = 128
-
-	_ Implementation = iota
-	AVX
-	AVX2
-	AVX2Inline
-	PureGo
-	AVXInline
-)
-
-var (
-	hasAVX = cpu.X86.HasAVX
-	// Having AVX2 does not guarantee
-	// that AVX is also present.
-	hasAVX2 = cpu.X86.HasAVX2 && hasAVX
-)
-
-func (impl Implementation) String() string {
-	switch impl {
-	case AVX:
-		return "AVX"
-	case AVXInline:
-		return "AVXInline"
-	case AVX2:
-		return "AVX2"
-	case AVX2Inline:
-		return "AVX2Inline"
-	case PureGo:
-		return "PureGo"
-	default:
-		return "UNKNOWN"
-	}
-}
-
-func NewWith(impl Implementation) hash.Hash {
-	switch impl {
-	case AVX:
-		return newAVX()
-	case AVXInline:
-		return newAVXInline()
-	case AVX2:
-		return newAVX2()
-	case AVX2Inline:
-		return newAVX2Inline()
-	case PureGo:
-		return newPure()
-	default:
-		return New()
-	}
-}
-
-// New returns a new hash.Hash computing the Tillich-Zémor checksum.
-func New() hash.Hash {
-	if hasAVX2 {
-		return newAVX2Inline()
-	} else if hasAVX {
-		return newAVXInline()
-	} else {
-		return newPure()
-	}
-}
-
-// Sum returns Tillich-Zémor checksum of data.
-func Sum(data []byte) [Size]byte {
-	if hasAVX2 {
-		d := newAVX2Inline()
-		_, _ = d.Write(data) // no errors
-		return d.checkSum()
-	} else if hasAVX {
-		d := newAVXInline()
-		_, _ = d.Write(data) // no errors
-		return d.checkSum()
-	} else {
-		d := newPure()
-		_, _ = d.Write(data) // no errors
-		return d.checkSum()
-	}
-}
-
 // Concat performs combining of hashes based on homomorphic property.
 func Concat(hs [][]byte) ([]byte, error) {
 	var b, c sl2
diff --git a/tz/hash_test.go b/tz/hash_test.go
index c0c10a4..539b8f1 100644
--- a/tz/hash_test.go
+++ b/tz/hash_test.go
@@ -2,38 +2,29 @@ package tz
 
 import (
 	"encoding/hex"
+	"fmt"
 	"io"
 	"math/rand"
 	"testing"
 
 	"github.com/stretchr/testify/require"
+	"golang.org/x/sys/cpu"
 )
 
 const benchDataSize = 100000
 
-var providers = []Implementation{
-	AVX,
-	AVXInline,
-	AVX2,
-	AVX2Inline,
-	PureGo,
+type arch struct {
+	HasAVX  bool
+	HasAVX2 bool
 }
 
-func TestNewWith(t *testing.T) {
-	d := NewWith(AVX)
-	require.IsType(t, (*digest)(nil), d)
-
-	d = NewWith(AVXInline)
-	require.IsType(t, (*digest4)(nil), d)
-
-	d = NewWith(AVX2)
-	require.IsType(t, (*digest2)(nil), d)
-
-	d = NewWith(AVX2Inline)
-	require.IsType(t, (*digest3)(nil), d)
-
-	d = NewWith(PureGo)
-	require.IsType(t, (*digestp)(nil), d)
+var backends = []struct {
+	Name string
+	arch
+}{
+	{"AVX", arch{true, false}},
+	{"AVX2", arch{true, true}},
+	{"Generic", arch{false, false}},
 }
 
 var testCases = []struct {
@@ -83,10 +74,12 @@ var testCases = []struct {
 }
 
 func TestHash(t *testing.T) {
-	for i := range providers {
-		p := providers[i]
-		t.Run(p.String()+" digest", func(t *testing.T) {
-			d := NewWith(p)
+	for i, b := range backends {
+		t.Run(b.Name+" digest", func(t *testing.T) {
+			prepareArch(t, backends[i].arch)
+
+			fmt.Println("FEATURES:", cpu.X86.HasAVX, cpu.X86.HasAVX2)
+			d := New()
 			for _, tc := range testCases {
 				d.Reset()
 				_, _ = d.Write(tc.input)
@@ -97,6 +90,20 @@ func TestHash(t *testing.T) {
 	}
 }
 
+func prepareArch(t testing.TB, b arch) {
+	realCPU := cpu.X86
+	if !realCPU.HasAVX2 && b.HasAVX2 || !realCPU.HasAVX && b.HasAVX {
+		t.Skip("Underlying CPU doesn't support necessary features")
+	} else {
+		t.Cleanup(func() {
+			cpu.X86.HasAVX = realCPU.HasAVX
+			cpu.X86.HasAVX2 = realCPU.HasAVX2
+		})
+		cpu.X86.HasAVX = b.HasAVX
+		cpu.X86.HasAVX2 = b.HasAVX2
+	}
+}
+
 func newBuffer() (data []byte) {
 	data = make([]byte, benchDataSize)
 
@@ -110,20 +117,20 @@ func newBuffer() (data []byte) {
 
 func BenchmarkSum(b *testing.B) {
 	data := newBuffer()
-	size := int64(len(data))
 
-	for i := range providers {
-		p := providers[i]
-		b.Run(p.String()+" digest", func(b *testing.B) {
+	for i := range backends {
+		b.Run(backends[i].Name+" digest", func(b *testing.B) {
+			prepareArch(b, backends[i].arch)
+
 			b.ResetTimer()
 			b.ReportAllocs()
-			d := NewWith(p)
+			d := New()
 			for i := 0; i < b.N; i++ {
 				d.Reset()
 				_, _ = d.Write(data)
 				d.Sum(nil)
 			}
-			b.SetBytes(size)
+			b.SetBytes(int64(len(data)))
 		})
 	}
 }
diff --git a/tz/pure.go b/tz/pure.go
deleted file mode 100644
index af9b3a7..0000000
--- a/tz/pure.go
+++ /dev/null
@@ -1,92 +0,0 @@
-package tz
-
-import (
-	"github.com/nspcc-dev/tzhash/gf127"
-)
-
-type digestp struct {
-	x [4]GF127
-}
-
-// New returns a new hash.Hash computing the Tillich-Zémor checksum.
-func newPure() *digestp {
-	d := new(digestp)
-	d.Reset()
-	return d
-}
-
-func (d *digestp) Sum(in []byte) []byte {
-	// Make a copy of d so that caller can keep writing and summing.
-	d0 := *d
-	h := d0.checkSum()
-	return append(in, h[:]...)
-}
-
-func (d *digestp) checkSum() [Size]byte {
-	return d.byteArray()
-}
-
-func (d *digestp) byteArray() (b [Size]byte) {
-	for i := 0; i < 4; i++ {
-		t := d.x[i].ByteArray()
-		copy(b[i*16:], t[:])
-	}
-	return
-}
-
-func (d *digestp) Reset() {
-	d.x[0] = GF127{1, 0}
-	d.x[1] = GF127{0, 0}
-	d.x[2] = GF127{0, 0}
-	d.x[3] = GF127{1, 0}
-}
-
-func (d *digestp) Write(data []byte) (n int, err error) {
-	n = len(data)
-	tmp := new(GF127)
-	for _, b := range data {
-		mulBitRightPure(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x80 != 0, tmp)
-		mulBitRightPure(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x40 != 0, tmp)
-		mulBitRightPure(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x20 != 0, tmp)
-		mulBitRightPure(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x10 != 0, tmp)
-		mulBitRightPure(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x08 != 0, tmp)
-		mulBitRightPure(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x04 != 0, tmp)
-		mulBitRightPure(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x02 != 0, tmp)
-		mulBitRightPure(&d.x[0], &d.x[1], &d.x[2], &d.x[3], b&0x01 != 0, tmp)
-	}
-	return
-}
-
-func (d *digestp) Size() int {
-	return Size
-}
-
-func (d *digestp) BlockSize() int {
-	return hashBlockSize
-}
-
-func mulBitRightPure(c00, c01, c10, c11 *GF127, bit bool, tmp *GF127) {
-	if bit {
-		*tmp = *c00
-		gf127.Mul10(c00, c00)
-		gf127.Add(c00, c01, c00)
-		gf127.Mul11(tmp, tmp)
-		gf127.Add(c01, tmp, c01)
-
-		*tmp = *c10
-		gf127.Mul10(c10, c10)
-		gf127.Add(c10, c11, c10)
-		gf127.Mul11(tmp, tmp)
-		gf127.Add(c11, tmp, c11)
-	} else {
-		*tmp = *c00
-		gf127.Mul10(c00, c00)
-		gf127.Add(c00, c01, c00)
-		*c01 = *tmp
-
-		*tmp = *c10
-		gf127.Mul10(c10, c10)
-		gf127.Add(c10, c11, c10)
-		*c11 = *tmp
-	}
-}
diff --git a/tz/sl2.go b/tz/sl2.go
index 14f963b..cc45281 100644
--- a/tz/sl2.go
+++ b/tz/sl2.go
@@ -17,11 +17,13 @@ var id = sl2{
 	{GF127{0, 0}, GF127{1, 0}},
 }
 
+// MarshalBinary implements encoding.BinaryMarshaler.
 func (c *sl2) MarshalBinary() (data []byte, err error) {
 	s := c.ByteArray()
 	return s[:], nil
 }
 
+// UnmarshalBinary implements encoding.BinaryUnmarshaler.
 func (c *sl2) UnmarshalBinary(data []byte) (err error) {
 	if len(data) != 64 {
 		return errors.New("data must be 64-bytes long")
@@ -113,6 +115,7 @@ func (c *sl2) MulB() *sl2 {
 	return c
 }
 
+// Mul returns a * b in GL_2(GF(2^127))
 func (c *sl2) Mul(a, b *sl2) *sl2 {
 	var x [4]GF127
 
@@ -158,16 +161,16 @@ func (c *sl2) String() string {
 
 func (c *sl2) ByteArray() (b [Size]byte) {
 	t := c[0][0].ByteArray()
-	copy(b[:], t)
+	copy(b[:], t[:])
 
 	t = c[0][1].ByteArray()
-	copy(b[16:], t)
+	copy(b[16:], t[:])
 
 	t = c[1][0].ByteArray()
-	copy(b[32:], t)
+	copy(b[32:], t[:])
 
 	t = c[1][1].ByteArray()
-	copy(b[48:], t)
+	copy(b[48:], t[:])
 
 	return
 }