From 026731b260faae40908961a9799c30276511442a Mon Sep 17 00:00:00 2001
From: Evgenii Stratonikov <evgeniy@nspcc.ru>
Date: Wed, 9 Mar 2022 15:17:18 +0300
Subject: [PATCH] gf127: use build tags for different implementations

Signed-off-by: Evgenii Stratonikov <evgeniy@nspcc.ru>
---
 gf127/arith.go                         | 133 -----------------------
 gf127/avx/gf127.go                     |  32 ------
 gf127/avx/gf127_test.go                |  68 ------------
 gf127/doc.go                           |   7 ++
 gf127/gf127.go                         | 134 +++++++++++++++++++++--
 gf127/gf127_amd64.go                   |  55 ++++++++++
 gf127/{avx => }/gf127_amd64.s          |   8 +-
 gf127/gf127_generic.go                 |  24 +++++
 gf127/{arith_test.go => gf127_test.go} |   0
 gf127/{avx2 => }/gf127x2.go            |  23 ++--
 gf127/gf127x2_amd64.go                 |  27 +++++
 gf127/{avx2 => }/gf127x2_amd64.s       |   4 +-
 gf127/gf127x2_generic.go               |  14 +++
 gf127/{avx2 => }/gf127x2_test.go       |   4 +-
 tz/avx2.go                             |  10 +-
 tz/avx2_inline.go                      |  10 +-
 tz/sl2.go                              | 143 ++++++++++---------------
 tz/sl2_test.go                         |   7 +-
 18 files changed, 342 insertions(+), 361 deletions(-)
 delete mode 100644 gf127/arith.go
 delete mode 100644 gf127/avx/gf127.go
 delete mode 100644 gf127/avx/gf127_test.go
 create mode 100644 gf127/doc.go
 create mode 100644 gf127/gf127_amd64.go
 rename gf127/{avx => }/gf127_amd64.s (93%)
 create mode 100644 gf127/gf127_generic.go
 rename gf127/{arith_test.go => gf127_test.go} (100%)
 rename gf127/{avx2 => }/gf127x2.go (81%)
 create mode 100644 gf127/gf127x2_amd64.go
 rename gf127/{avx2 => }/gf127x2_amd64.s (90%)
 create mode 100644 gf127/gf127x2_generic.go
 rename gf127/{avx2 => }/gf127x2_test.go (97%)

diff --git a/gf127/arith.go b/gf127/arith.go
deleted file mode 100644
index 1a5243f..0000000
--- a/gf127/arith.go
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright 2019 (c) NSPCC
-//
-// Package gf127 implements the GF(2^127) arithmetic
-// modulo reduction polynomial x^127 + x^63 + 1 .
-// Implementation is in pure Go.
-package gf127
-
-import (
-	"math/bits"
-)
-
-var (
-	// x126x631 is reduction polynomial x^127+x^63+1
-	x127x631 = GF127{msb64 + 1, msb64}
-)
-
-// New constructs new element of GF(2^127) as hi*x^64 + lo.
-// It is assumed that hi has zero MSB.
-func New(lo, hi uint64) *GF127 {
-	return &GF127{lo, hi}
-}
-
-// Inv sets b to a^-1
-// Algorithm is based on Extended Euclidean Algorithm
-// and is described by Hankerson, Hernandez, Menezes in
-// https://link.springer.com/content/pdf/10.1007/3-540-44499-8_1.pdf
-func Inv(a, b *GF127) {
-	var (
-		v    = x127x631
-		u    = *a
-		c, d = &GF127{1, 0}, &GF127{0, 0}
-		t    = new(GF127)
-		x    *GF127
-	)
-
-	// degree of polynomial is a position of most significant bit
-	for du, dv := msb(&u), msb(&v); du != 0; du, dv = msb(&u), msb(&v) {
-		if du < dv {
-			v, u = u, v
-			dv, du = du, dv
-			d, c = c, d
-		}
-
-		x = xN(du - dv)
-
-		Mul(x, &v, t)
-		Add(&u, t, &u)
-
-		// becasuse mul performs reduction on t, we need
-		// manually reduce u at first step
-		if msb(&u) == 127 {
-			Add(&u, &x127x631, &u)
-		}
-
-		Mul(x, d, t)
-		Add(c, t, c)
-	}
-	*b = *c
-}
-
-func xN(n int) *GF127 {
-	if n < 64 {
-		return &GF127{1 << uint(n), 0}
-	}
-	return &GF127{0, 1 << uint(n-64)}
-}
-
-func msb(a *GF127) (x int) {
-	x = bits.LeadingZeros64(a[1])
-	if x == 64 {
-		x = bits.LeadingZeros64(a[0]) + 64
-	}
-	return 127 - x
-}
-
-// Mul1 copies a to b.
-func Mul1(a, b *GF127) {
-	b[0] = a[0]
-	b[1] = a[1]
-}
-
-// And sets c to a & b (bitwise-and).
-func And(a, b, c *GF127) {
-	c[0] = a[0] & b[0]
-	c[1] = a[1] & b[1]
-}
-
-// Add sets c to a+b.
-func Add(a, b, c *GF127) {
-	c[0] = a[0] ^ b[0]
-	c[1] = a[1] ^ b[1]
-}
-
-// Mul sets c to a*b.
-func Mul(a, b, c *GF127) {
-	r := new(GF127)
-	d := *a
-	for i := uint(0); i < 64; i++ {
-		if b[0]&(1<<i) != 0 {
-			Add(r, &d, r)
-		}
-		Mul10(&d, &d)
-	}
-	for i := uint(0); i < 63; i++ {
-		if b[1]&(1<<i) != 0 {
-			Add(r, &d, r)
-		}
-		Mul10(&d, &d)
-	}
-	*c = *r
-}
-
-// Mul10 sets b to a*x.
-func Mul10(a, b *GF127) {
-	c := a[0] >> 63
-	b[0] = a[0] << 1
-	b[1] = (a[1] << 1) ^ c
-
-	mask := b[1] & msb64
-	b[0] ^= mask | (mask >> 63)
-	b[1] ^= mask
-}
-
-// Mul11 sets b to a*(x+1).
-func Mul11(a, b *GF127) {
-	c := a[0] >> 63
-	b[0] = a[0] ^ (a[0] << 1)
-	b[1] = a[1] ^ (a[1] << 1) ^ c
-
-	mask := b[1] & msb64
-	b[0] ^= mask | (mask >> 63)
-	b[1] ^= mask
-}
diff --git a/gf127/avx/gf127.go b/gf127/avx/gf127.go
deleted file mode 100644
index e77aeb2..0000000
--- a/gf127/avx/gf127.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2018 (c) NSPCC
-//
-// Package gf127 implements the GF(2^127) arithmetic
-// modulo reduction polynomial x^127 + x^63 + 1 .
-// This is rather straight-forward re-implementation of C library
-// available here https://github.com/srijs/hwsl2-core .
-// Interfaces are highly influenced by math/big .
-package avx
-
-import (
-	"github.com/nspcc-dev/tzhash/gf127"
-)
-
-// GF127 is an alias for a main type.
-type GF127 = gf127.GF127
-
-const msb64 = uint64(1) << 63
-
-// x127x63 represents x^127 + x^63. Used in assembly file.
-var x127x63 = GF127{msb64, msb64} //nolint:deadcode,varcheck
-
-// Add sets c to a+b.
-func Add(a, b, c *GF127)
-
-// Mul sets c to a*b.
-func Mul(a, b, c *GF127)
-
-// Mul10 sets b to a*x.
-func Mul10(a, b *GF127)
-
-// Mul11 sets b to a*(x+1).
-func Mul11(a, b *GF127)
diff --git a/gf127/avx/gf127_test.go b/gf127/avx/gf127_test.go
deleted file mode 100644
index 9da629c..0000000
--- a/gf127/avx/gf127_test.go
+++ /dev/null
@@ -1,68 +0,0 @@
-package avx
-
-import (
-	"testing"
-
-	"github.com/nspcc-dev/tzhash/gf127"
-	"github.com/stretchr/testify/require"
-)
-
-const maxUint64 = ^uint64(0)
-
-func TestAdd(t *testing.T) {
-	var (
-		a = gf127.Random()
-		b = gf127.Random()
-		e = &GF127{a[0] ^ b[0], a[1] ^ b[1]}
-		c = new(GF127)
-	)
-	Add(a, b, c)
-	require.Equal(t, e, c)
-}
-
-var testCasesMul = [][3]*GF127{
-	// (x+1)*(x^63+x^62+...+1) == x^64+1
-	{&GF127{3, 0}, &GF127{maxUint64, 0}, &GF127{1, 1}},
-
-	// x^126 * x^2 == x^128 == x^64 + x
-	{&GF127{0, 1 << 62}, &GF127{4, 0}, &GF127{2, 1}},
-
-	// (x^64+x^63+1) * (x^64+x) == x^128+x^65+x^127+x^64+x^64+x == x^65+x^64+x^63+1
-	{&GF127{1 + 1<<63, 1}, &GF127{2, 1}, &GF127{0x8000000000000001, 3}},
-}
-
-func TestMul(t *testing.T) {
-	c := new(GF127)
-	for _, tc := range testCasesMul {
-		Mul(tc[0], tc[1], c)
-		require.Equal(t, tc[2], c)
-	}
-}
-
-var testCasesMul10 = [][2]*GF127{
-	{&GF127{123, 0}, &GF127{246, 0}},
-	{&GF127{maxUint64, 2}, &GF127{maxUint64 - 1, 5}},
-	{&GF127{0, maxUint64 >> 1}, &GF127{1 + 1<<63, maxUint64>>1 - 1}},
-}
-
-func TestMul10(t *testing.T) {
-	c := new(GF127)
-	for _, tc := range testCasesMul10 {
-		Mul10(tc[0], c)
-		require.Equal(t, tc[1], c)
-	}
-}
-
-var testCasesMul11 = [][2]*GF127{
-	{&GF127{123, 0}, &GF127{141, 0}},
-	{&GF127{maxUint64, 2}, &GF127{1, 7}},
-	{&GF127{0, maxUint64 >> 1}, &GF127{1 + 1<<63, 1}},
-}
-
-func TestMul11(t *testing.T) {
-	c := new(GF127)
-	for _, tc := range testCasesMul11 {
-		Mul11(tc[0], c)
-		require.Equal(t, tc[1], c)
-	}
-}
diff --git a/gf127/doc.go b/gf127/doc.go
new file mode 100644
index 0000000..1b100c8
--- /dev/null
+++ b/gf127/doc.go
@@ -0,0 +1,7 @@
+// Package gf127 implements the GF(2^127) arithmetic
+// modulo reduction polynomial x^127 + x^63 + 1 .
+// gf127.go contains common definitions.
+// Other files contain architecture-specific implementations.
+//
+// Copyright 2019 (c) NSPCC
+package gf127
diff --git a/gf127/gf127.go b/gf127/gf127.go
index de3c276..414f795 100644
--- a/gf127/gf127.go
+++ b/gf127/gf127.go
@@ -4,17 +4,135 @@ import (
 	"encoding/binary"
 	"encoding/hex"
 	"errors"
+	"math/bits"
 	"math/rand"
 )
 
+// GF127 represents element of GF(2^127)
+type GF127 [2]uint64
+
 const (
 	byteSize  = 16
 	maxUint64 = ^uint64(0)
 	msb64     = uint64(1) << 63
 )
 
-// GF127 represents element of GF(2^127)
-type GF127 [2]uint64
+// x127x631 is reduction polynomial x^127 + x^63 + 1
+var x127x631 = GF127{msb64 + 1, msb64}
+
+// New constructs new element of GF(2^127) as hi*x^64 + lo.
+// It is assumed that hi has zero MSB.
+func New(lo, hi uint64) *GF127 {
+	return &GF127{lo, hi}
+}
+
+func addGeneric(a, b, c *GF127) {
+	c[0] = a[0] ^ b[0]
+	c[1] = a[1] ^ b[1]
+}
+
+func mulGeneric(a, b, c *GF127) {
+	r := new(GF127)
+	d := *a
+	for i := uint(0); i < 64; i++ {
+		if b[0]&(1<<i) != 0 {
+			addGeneric(r, &d, r)
+		}
+		mul10Generic(&d, &d)
+	}
+	for i := uint(0); i < 63; i++ {
+		if b[1]&(1<<i) != 0 {
+			addGeneric(r, &d, r)
+		}
+		mul10Generic(&d, &d)
+	}
+	*c = *r
+}
+
+func mul10Generic(a, b *GF127) {
+	c := a[0] >> 63
+	b[0] = a[0] << 1
+	b[1] = (a[1] << 1) ^ c
+
+	mask := b[1] & msb64
+	b[0] ^= mask | (mask >> 63)
+	b[1] ^= mask
+}
+
+func mul11Generic(a, b *GF127) {
+	c := a[0] >> 63
+	b[0] = a[0] ^ (a[0] << 1)
+	b[1] = a[1] ^ (a[1] << 1) ^ c
+
+	mask := b[1] & msb64
+	b[0] ^= mask | (mask >> 63)
+	b[1] ^= mask
+}
+
+// Inv sets b to a^-1.
+// The algorithm is based on the Extended Euclidean Algorithm
+// and is described by Hankerson, Hernandez, Menezes in
+// https://link.springer.com/content/pdf/10.1007/3-540-44499-8_1.pdf
+func Inv(a, b *GF127) {
+	var (
+		v    = x127x631
+		u    = *a
+		c, d = &GF127{1, 0}, &GF127{0, 0}
+		t    = new(GF127)
+		x    *GF127
+	)
+
+	// the degree of a polynomial is the position of its most significant bit
+	for du, dv := msb(&u), msb(&v); du != 0; du, dv = msb(&u), msb(&v) {
+		if du < dv {
+			v, u = u, v
+			dv, du = du, dv
+			d, c = c, d
+		}
+
+		x = xN(du - dv)
+
+		Mul(x, &v, t)
+		Add(&u, t, &u)
+
+		// because Mul performs reduction on t, we need to
+		// manually reduce u at the first step
+		if msb(&u) == 127 {
+			Add(&u, &x127x631, &u)
+		}
+
+		Mul(x, d, t)
+		Add(c, t, c)
+	}
+	*b = *c
+}
+
+func xN(n int) *GF127 {
+	if n < 64 {
+		return &GF127{1 << uint(n), 0}
+	}
+	return &GF127{0, 1 << uint(n-64)}
+}
+
+func msb(a *GF127) (x int) {
+	x = bits.LeadingZeros64(a[1])
+	if x == 64 {
+		x = bits.LeadingZeros64(a[0]) + 64
+	}
+	return 127 - x
+}
+
+// Mul1 sets b to a*1, i.e. copies a to b (destination is the last argument, as in Mul10 and Mul11).
+func Mul1(a, b *GF127) {
+	b[0] = a[0]
+	b[1] = a[1]
+}
+
+// And sets c to a & b (bitwise-and).
+func And(a, b, c *GF127) {
+	c[0] = a[0] & b[0]
+	c[1] = a[1] & b[1]
+}
 
 // Random returns random element from GF(2^127).
 // Is used mostly for testing.
@@ -24,7 +142,8 @@ func Random() *GF127 {
 
 // String returns hex-encoded representation, starting with MSB.
 func (c *GF127) String() string {
-	return hex.EncodeToString(c.ByteArray())
+	buf := c.ByteArray()
+	return hex.EncodeToString(buf[:])
 }
 
 // Equals checks if two reduced (zero MSB) elements of GF(2^127) are equal
@@ -33,16 +152,17 @@ func (c *GF127) Equals(b *GF127) bool {
 }
 
 // ByteArray represents element of GF(2^127) as byte array of length 16.
-func (c *GF127) ByteArray() (buf []byte) {
-	buf = make([]byte, 16)
+func (c *GF127) ByteArray() []byte {
+	buf := make([]byte, 16)
 	binary.BigEndian.PutUint64(buf[:8], c[1])
 	binary.BigEndian.PutUint64(buf[8:], c[0])
-	return
+	return buf
 }
 
 // MarshalBinary implements encoding.BinaryMarshaler.
 func (c *GF127) MarshalBinary() (data []byte, err error) {
-	return c.ByteArray(), nil
+	buf := c.ByteArray()
+	return buf[:], nil
 }
 
 // UnmarshalBinary implements encoding.BinaryUnmarshaler.
diff --git a/gf127/gf127_amd64.go b/gf127/gf127_amd64.go
new file mode 100644
index 0000000..186666a
--- /dev/null
+++ b/gf127/gf127_amd64.go
@@ -0,0 +1,55 @@
+//go:build amd64 && !generic
+// +build amd64,!generic
+
+// Package gf127 implements the GF(2^127) arithmetic
+// modulo reduction polynomial x^127 + x^63 + 1 .
+// This is rather straight-forward re-implementation of C library
+// available here https://github.com/srijs/hwsl2-core .
+// Interfaces are highly influenced by math/big .
+package gf127
+
+import "golang.org/x/sys/cpu"
+
+// x127x63 represents x^127 + x^63. Used in the assembly file (gf127_amd64.s).
+var x127x63 = GF127{msb64, msb64} //nolint:deadcode,varcheck
+
+// Add sets c to a+b.
+func Add(a, b, c *GF127) {
+	if cpu.X86.HasAVX {
+		addAVX(a, b, c)
+	} else {
+		addGeneric(a, b, c)
+	}
+}
+
+// Mul sets c to a*b.
+func Mul(a, b, c *GF127) {
+	if cpu.X86.HasAVX {
+		mulAVX(a, b, c)
+	} else {
+		mulGeneric(a, b, c)
+	}
+}
+
+// Mul10 sets b to a*x.
+func Mul10(a, b *GF127) {
+	if cpu.X86.HasAVX {
+		mul10AVX(a, b)
+	} else {
+		mul10Generic(a, b)
+	}
+}
+
+// Mul11 sets b to a*(x+1).
+func Mul11(a, b *GF127) {
+	if cpu.X86.HasAVX {
+		mul11AVX(a, b)
+	} else {
+		mul11Generic(a, b)
+	}
+}
+
+func addAVX(a, b, c *GF127)
+func mulAVX(a, b, c *GF127)
+func mul10AVX(a, b *GF127)
+func mul11AVX(a, b *GF127)
diff --git a/gf127/avx/gf127_amd64.s b/gf127/gf127_amd64.s
similarity index 93%
rename from gf127/avx/gf127_amd64.s
rename to gf127/gf127_amd64.s
index 281efc6..3e64b50 100644
--- a/gf127/avx/gf127_amd64.s
+++ b/gf127/gf127_amd64.s
@@ -1,7 +1,7 @@
 #include "textflag.h"
 
 // func Add(a, b, c *[2]uint64)
-TEXT ·Add(SB), NOSPLIT, $0
+TEXT ·addAVX(SB), NOSPLIT, $0
 	MOVQ   a+0(FP), AX
 	MOVUPD (AX), X0
 	MOVQ   b+8(FP), BX
@@ -12,7 +12,7 @@ TEXT ·Add(SB), NOSPLIT, $0
 	RET
 
 // func Mul10(a, b *[2]uint64)
-TEXT ·Mul10(SB), NOSPLIT, $0
+TEXT ·mul10AVX(SB), NOSPLIT, $0
 	MOVQ        a+0(FP), AX
 	MOVUPD      (AX), X0
 	VPSLLQ      $1, X0, X1
@@ -28,7 +28,7 @@ TEXT ·Mul10(SB), NOSPLIT, $0
 	RET
 
 // func Mul11(a, b *[2]uint64)
-TEXT ·Mul11(SB), NOSPLIT, $0
+TEXT ·mul11AVX(SB), NOSPLIT, $0
 	MOVQ        a+0(FP), AX
 	MOVUPD      (AX), X0
 	VPSLLQ      $1, X0, X1
@@ -45,7 +45,7 @@ TEXT ·Mul11(SB), NOSPLIT, $0
 	RET
 
 // func Mul(a, b, c *[2]uint64)
-TEXT ·Mul(SB), NOSPLIT, $0
+TEXT ·mulAVX(SB), NOSPLIT, $0
 	MOVQ        a+0(FP), AX       // X0 = a0 . a1
 	MOVUPD      (AX), X0          // X0 = a0 . a1
 	MOVQ        b+8(FP), BX       // X1 = b0 . b1
diff --git a/gf127/gf127_generic.go b/gf127/gf127_generic.go
new file mode 100644
index 0000000..33919d9
--- /dev/null
+++ b/gf127/gf127_generic.go
@@ -0,0 +1,24 @@
+//go:build !amd64 || generic
+// +build !amd64 generic
+
+package gf127
+
+// Add sets c to a+b.
+func Add(a, b, c *GF127) {
+	addGeneric(a, b, c)
+}
+
+// Mul sets c to a*b.
+func Mul(a, b, c *GF127) {
+	mulGeneric(a, b, c)
+}
+
+// Mul10 sets b to a*x.
+func Mul10(a, b *GF127) {
+	mul10Generic(a, b)
+}
+
+// Mul11 sets b to a*(x+1).
+func Mul11(a, b *GF127) {
+	mul11Generic(a, b)
+}
diff --git a/gf127/arith_test.go b/gf127/gf127_test.go
similarity index 100%
rename from gf127/arith_test.go
rename to gf127/gf127_test.go
diff --git a/gf127/avx2/gf127x2.go b/gf127/gf127x2.go
similarity index 81%
rename from gf127/avx2/gf127x2.go
rename to gf127/gf127x2.go
index 7dbca58..12d5cef 100644
--- a/gf127/avx2/gf127x2.go
+++ b/gf127/gf127x2.go
@@ -1,18 +1,23 @@
-package avx2
+package gf127
 
 import (
 	"encoding/binary"
 	"encoding/hex"
-
-	"github.com/nspcc-dev/tzhash/gf127"
 )
 
-// GF127 is an alias for a main type.
-type GF127 = gf127.GF127
-
 // GF127x2 represents a pair of elements of GF(2^127) stored together.
 type GF127x2 [2]GF127
 
+func mul10x2Generic(a, b *GF127x2) {
+	mul10Generic(&a[0], &b[0])
+	mul10Generic(&a[1], &b[1])
+}
+
+func mul11x2Generic(a, b *GF127x2) {
+	mul11Generic(&a[0], &b[0])
+	mul11Generic(&a[1], &b[1])
+}
+
 // Split returns 2 components of pair without additional allocations.
 func Split(a *GF127x2) (*GF127, *GF127) {
 	return &a[0], &a[1]
@@ -45,9 +50,3 @@ func (a *GF127x2) ByteArray() (buf []byte) {
 	binary.BigEndian.PutUint64(buf[24:], a[1][0])
 	return
 }
-
-// Mul10x2 sets (b1, b2) to (a1*x, a2*x)
-func Mul10x2(a, b *GF127x2)
-
-// Mul10x2 sets (b1, b2) to (a1*(x+1), a2*(x+1))
-func Mul11x2(a, b *GF127x2)
diff --git a/gf127/gf127x2_amd64.go b/gf127/gf127x2_amd64.go
new file mode 100644
index 0000000..6cf8787
--- /dev/null
+++ b/gf127/gf127x2_amd64.go
@@ -0,0 +1,27 @@
+//go:build amd64 && !generic
+// +build amd64,!generic
+
+package gf127
+
+import "golang.org/x/sys/cpu"
+
+// Mul10x2 sets (b1, b2) to (a1*x, a2*x)
+func Mul10x2(a, b *GF127x2) {
+	if cpu.X86.HasAVX && cpu.X86.HasAVX2 {
+		mul10x2AVX2(a, b)
+	} else {
+		mul10x2Generic(a, b)
+	}
+}
+
+// Mul11x2 sets (b1, b2) to (a1*(x+1), a2*(x+1))
+func Mul11x2(a, b *GF127x2) {
+	if cpu.X86.HasAVX && cpu.X86.HasAVX2 {
+		mul11x2AVX2(a, b)
+	} else {
+		mul11x2Generic(a, b)
+	}
+}
+
+func mul10x2AVX2(a, b *GF127x2)
+func mul11x2AVX2(a, b *GF127x2)
diff --git a/gf127/avx2/gf127x2_amd64.s b/gf127/gf127x2_amd64.s
similarity index 90%
rename from gf127/avx2/gf127x2_amd64.s
rename to gf127/gf127x2_amd64.s
index ac708e5..9596be5 100644
--- a/gf127/avx2/gf127x2_amd64.s
+++ b/gf127/gf127x2_amd64.s
@@ -1,7 +1,7 @@
 #include "textflag.h"
 
 // func Mul10x2(a, b) *[4]uint64
-TEXT ·Mul10x2(SB), NOSPLIT, $0
+TEXT ·mul10x2AVX2(SB), NOSPLIT, $0
 	MOVQ        a+0(FP), AX
 	VMOVDQA     (AX), Y0
 	VPSLLQ      $1, Y0, Y1
@@ -17,7 +17,7 @@ TEXT ·Mul10x2(SB), NOSPLIT, $0
 	RET
 
 // func Mul11x2(a, b) *[4]uint64
-TEXT ·Mul11x2(SB), NOSPLIT, $0
+TEXT ·mul11x2AVX2(SB), NOSPLIT, $0
 	MOVQ        a+0(FP), AX
 	VMOVDQA     (AX), Y0
 	VPSLLQ      $1, Y0, Y1
diff --git a/gf127/gf127x2_generic.go b/gf127/gf127x2_generic.go
new file mode 100644
index 0000000..4ca4c36
--- /dev/null
+++ b/gf127/gf127x2_generic.go
@@ -0,0 +1,14 @@
+//go:build !(amd64 && !generic)
+// +build !amd64 generic
+
+package gf127
+
+// Mul10x2 sets (b1, b2) to (a1*x, a2*x)
+func Mul10x2(a, b *GF127x2) {
+	mul10x2Generic(a, b)
+}
+
+// Mul11x2 sets (b1, b2) to (a1*(x+1), a2*(x+1))
+func Mul11x2(a, b *GF127x2) {
+	mul11x2Generic(a, b)
+}
diff --git a/gf127/avx2/gf127x2_test.go b/gf127/gf127x2_test.go
similarity index 97%
rename from gf127/avx2/gf127x2_test.go
rename to gf127/gf127x2_test.go
index 5de5865..73a4a68 100644
--- a/gf127/avx2/gf127x2_test.go
+++ b/gf127/gf127x2_test.go
@@ -1,4 +1,4 @@
-package avx2
+package gf127
 
 import (
 	"testing"
@@ -6,8 +6,6 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
-const maxUint64 = ^uint64(0)
-
 var testCasesSplit = []struct {
 	num *GF127x2
 	h1  *GF127
diff --git a/tz/avx2.go b/tz/avx2.go
index 48c7025..b41c182 100644
--- a/tz/avx2.go
+++ b/tz/avx2.go
@@ -3,11 +3,11 @@ package tz
 import (
 	"hash"
 
-	"github.com/nspcc-dev/tzhash/gf127/avx2"
+	"github.com/nspcc-dev/tzhash/gf127"
 )
 
 type digest2 struct {
-	x [2]avx2.GF127x2
+	x [2]gf127.GF127x2
 }
 
 // type assertion
@@ -41,8 +41,8 @@ func (d *digest2) Sum(in []byte) []byte {
 	return append(in, h[:]...)
 }
 func (d *digest2) Reset() {
-	d.x[0] = avx2.GF127x2{GF127{1, 0}, GF127{0, 0}}
-	d.x[1] = avx2.GF127x2{GF127{0, 0}, GF127{1, 0}}
+	d.x[0] = gf127.GF127x2{GF127{1, 0}, GF127{0, 0}}
+	d.x[1] = gf127.GF127x2{GF127{0, 0}, GF127{1, 0}}
 }
 func (d *digest2) Size() int      { return Size }
 func (d *digest2) BlockSize() int { return hashBlockSize }
@@ -59,4 +59,4 @@ func (d *digest2) checkSum() (b [Size]byte) {
 	return
 }
 
-func mulBitRightx2(c00c10 *avx2.GF127x2, c01c11 *avx2.GF127x2, e *GF127)
+func mulBitRightx2(c00c10 *gf127.GF127x2, c01c11 *gf127.GF127x2, e *GF127)
diff --git a/tz/avx2_inline.go b/tz/avx2_inline.go
index eb30a9a..f5fcecf 100644
--- a/tz/avx2_inline.go
+++ b/tz/avx2_inline.go
@@ -3,11 +3,11 @@ package tz
 import (
 	"hash"
 
-	"github.com/nspcc-dev/tzhash/gf127/avx2"
+	"github.com/nspcc-dev/tzhash/gf127"
 )
 
 type digest3 struct {
-	x [2]avx2.GF127x2
+	x [2]gf127.GF127x2
 }
 
 // type assertion
@@ -34,8 +34,8 @@ func (d *digest3) Sum(in []byte) []byte {
 	return append(in, h[:]...)
 }
 func (d *digest3) Reset() {
-	d.x[0] = avx2.GF127x2{GF127{1, 0}, GF127{0, 0}}
-	d.x[1] = avx2.GF127x2{GF127{0, 0}, GF127{1, 0}}
+	d.x[0] = gf127.GF127x2{GF127{1, 0}, GF127{0, 0}}
+	d.x[1] = gf127.GF127x2{GF127{0, 0}, GF127{1, 0}}
 }
 func (d *digest3) Size() int      { return Size }
 func (d *digest3) BlockSize() int { return hashBlockSize }
@@ -52,4 +52,4 @@ func (d *digest3) checkSum() (b [Size]byte) {
 	return
 }
 
-func mulByteSliceRightx2(c00c10 *avx2.GF127x2, c01c11 *avx2.GF127x2, n int, data *byte)
+func mulByteSliceRightx2(c00c10 *gf127.GF127x2, c01c11 *gf127.GF127x2, n int, data *byte)
diff --git a/tz/sl2.go b/tz/sl2.go
index 4676eb1..14f963b 100644
--- a/tz/sl2.go
+++ b/tz/sl2.go
@@ -4,7 +4,6 @@ import (
 	"errors"
 
 	"github.com/nspcc-dev/tzhash/gf127"
-	"github.com/nspcc-dev/tzhash/gf127/avx"
 )
 
 type (
@@ -18,16 +17,6 @@ var id = sl2{
 	{GF127{0, 0}, GF127{1, 0}},
 }
 
-var mul func(a, b, c *sl2, x *[4]GF127)
-
-func init() {
-	if hasAVX {
-		mul = mulSL2AVX
-	} else {
-		mul = mulSL2Pure
-	}
-}
-
 func (c *sl2) MarshalBinary() (data []byte, err error) {
 	s := c.ByteArray()
 	return s[:], nil
@@ -56,86 +45,54 @@ func (c *sl2) UnmarshalBinary(data []byte) (err error) {
 
 func (c *sl2) mulStrassen(a, b *sl2, x *[8]GF127) *sl2 { //nolint:unused
 	// strassen algorithm
-	avx.Add(&a[0][0], &a[1][1], &x[0])
-	avx.Add(&b[0][0], &b[1][1], &x[1])
-	avx.Mul(&x[0], &x[1], &x[0])
+	gf127.Add(&a[0][0], &a[1][1], &x[0])
+	gf127.Add(&b[0][0], &b[1][1], &x[1])
+	gf127.Mul(&x[0], &x[1], &x[0])
 
-	avx.Add(&a[1][0], &a[1][1], &x[1])
-	avx.Mul(&x[1], &b[0][0], &x[1])
+	gf127.Add(&a[1][0], &a[1][1], &x[1])
+	gf127.Mul(&x[1], &b[0][0], &x[1])
 
-	avx.Add(&b[0][1], &b[1][1], &x[2])
-	avx.Mul(&x[2], &a[0][0], &x[2])
+	gf127.Add(&b[0][1], &b[1][1], &x[2])
+	gf127.Mul(&x[2], &a[0][0], &x[2])
 
-	avx.Add(&b[1][0], &b[0][0], &x[3])
-	avx.Mul(&x[3], &a[1][1], &x[3])
+	gf127.Add(&b[1][0], &b[0][0], &x[3])
+	gf127.Mul(&x[3], &a[1][1], &x[3])
 
-	avx.Add(&a[0][0], &a[0][1], &x[4])
-	avx.Mul(&x[4], &b[1][1], &x[4])
+	gf127.Add(&a[0][0], &a[0][1], &x[4])
+	gf127.Mul(&x[4], &b[1][1], &x[4])
 
-	avx.Add(&a[1][0], &a[0][0], &x[5])
-	avx.Add(&b[0][0], &b[0][1], &x[6])
-	avx.Mul(&x[5], &x[6], &x[5])
+	gf127.Add(&a[1][0], &a[0][0], &x[5])
+	gf127.Add(&b[0][0], &b[0][1], &x[6])
+	gf127.Mul(&x[5], &x[6], &x[5])
 
-	avx.Add(&a[0][1], &a[1][1], &x[6])
-	avx.Add(&b[1][0], &b[1][1], &x[7])
-	avx.Mul(&x[6], &x[7], &x[6])
+	gf127.Add(&a[0][1], &a[1][1], &x[6])
+	gf127.Add(&b[1][0], &b[1][1], &x[7])
+	gf127.Mul(&x[6], &x[7], &x[6])
 
-	avx.Add(&x[2], &x[4], &c[0][1])
-	avx.Add(&x[1], &x[3], &c[1][0])
+	gf127.Add(&x[2], &x[4], &c[0][1])
+	gf127.Add(&x[1], &x[3], &c[1][0])
 
-	avx.Add(&x[4], &x[6], &x[4])
-	avx.Add(&x[0], &x[3], &c[0][0])
-	avx.Add(&c[0][0], &x[4], &c[0][0])
+	gf127.Add(&x[4], &x[6], &x[4])
+	gf127.Add(&x[0], &x[3], &c[0][0])
+	gf127.Add(&c[0][0], &x[4], &c[0][0])
 
-	avx.Add(&x[0], &x[1], &x[0])
-	avx.Add(&x[2], &x[5], &c[1][1])
-	avx.Add(&c[1][1], &x[0], &c[1][1])
+	gf127.Add(&x[0], &x[1], &x[0])
+	gf127.Add(&x[2], &x[5], &c[1][1])
+	gf127.Add(&c[1][1], &x[0], &c[1][1])
 
 	return c
 }
 
-func mulSL2AVX(a, b, c *sl2, x *[4]GF127) {
-	avx.Mul(&a[0][0], &b[0][0], &x[0])
-	avx.Mul(&a[0][0], &b[0][1], &x[1])
-	avx.Mul(&a[1][0], &b[0][0], &x[2])
-	avx.Mul(&a[1][0], &b[0][1], &x[3])
-
-	avx.Mul(&a[0][1], &b[1][0], &c[0][0])
-	avx.Add(&c[0][0], &x[0], &c[0][0])
-	avx.Mul(&a[0][1], &b[1][1], &c[0][1])
-	avx.Add(&c[0][1], &x[1], &c[0][1])
-	avx.Mul(&a[1][1], &b[1][0], &c[1][0])
-	avx.Add(&c[1][0], &x[2], &c[1][0])
-	avx.Mul(&a[1][1], &b[1][1], &c[1][1])
-	avx.Add(&c[1][1], &x[3], &c[1][1])
-}
-
-func mulSL2Pure(a, b, c *sl2, x *[4]GF127) {
-	gf127.Mul((*GF127)(&a[0][0]), (*GF127)(&b[0][0]), (*GF127)(&x[0]))
-	gf127.Mul((*GF127)(&a[0][0]), (*GF127)(&b[0][1]), (*GF127)(&x[1]))
-	gf127.Mul((*GF127)(&a[1][0]), (*GF127)(&b[0][0]), (*GF127)(&x[2]))
-	gf127.Mul((*GF127)(&a[1][0]), (*GF127)(&b[0][1]), (*GF127)(&x[3]))
-
-	gf127.Mul((*GF127)(&a[0][1]), (*GF127)(&b[1][0]), (*GF127)(&c[0][0]))
-	gf127.Add((*GF127)(&c[0][0]), (*GF127)(&x[0]), (*GF127)(&c[0][0]))
-	gf127.Mul((*GF127)(&a[0][1]), (*GF127)(&b[1][1]), (*GF127)(&c[0][1]))
-	gf127.Add((*GF127)(&c[0][1]), (*GF127)(&x[1]), (*GF127)(&c[0][1]))
-	gf127.Mul((*GF127)(&a[1][1]), (*GF127)(&b[1][0]), (*GF127)(&c[1][0]))
-	gf127.Add((*GF127)(&c[1][0]), (*GF127)(&x[2]), (*GF127)(&c[1][0]))
-	gf127.Mul((*GF127)(&a[1][1]), (*GF127)(&b[1][1]), (*GF127)(&c[1][1]))
-	gf127.Add((*GF127)(&c[1][1]), (*GF127)(&x[3]), (*GF127)(&c[1][1]))
-}
-
 func (c *sl2) MulA() *sl2 {
 	var a GF127
 
-	avx.Mul10(&c[0][0], &a)
+	gf127.Mul10(&c[0][0], &a)
 	gf127.Mul1(&c[0][0], &c[0][1])
-	avx.Add(&a, &c[0][1], &c[0][0])
+	gf127.Add(&a, &c[0][1], &c[0][0])
 
-	avx.Mul10(&c[1][0], &a)
+	gf127.Mul10(&c[1][0], &a)
 	gf127.Mul1(&c[1][0], &c[1][1])
-	avx.Add(&a, &c[1][1], &c[1][0])
+	gf127.Add(&a, &c[1][1], &c[1][0])
 
 	return c
 }
@@ -144,20 +101,34 @@ func (c *sl2) MulB() *sl2 {
 	var a GF127
 
 	gf127.Mul1(&c[0][0], &a)
-	avx.Mul10(&c[0][0], &c[0][0])
-	avx.Add(&c[0][1], &c[0][0], &c[0][0])
-	avx.Add(&c[0][0], &a, &c[0][1])
+	gf127.Mul10(&c[0][0], &c[0][0])
+	gf127.Add(&c[0][1], &c[0][0], &c[0][0])
+	gf127.Add(&c[0][0], &a, &c[0][1])
 
 	gf127.Mul1(&c[1][0], &a)
-	avx.Mul10(&c[1][0], &c[1][0])
-	avx.Add(&c[1][1], &c[1][0], &c[1][0])
-	avx.Add(&c[1][0], &a, &c[1][1])
+	gf127.Mul10(&c[1][0], &c[1][0])
+	gf127.Add(&c[1][1], &c[1][0], &c[1][0])
+	gf127.Add(&c[1][0], &a, &c[1][1])
 
 	return c
 }
 
 func (c *sl2) Mul(a, b *sl2) *sl2 {
-	mul(a, b, c, new([4]GF127))
+	var x [4]GF127
+
+	gf127.Mul(&a[0][0], &b[0][0], &x[0])
+	gf127.Mul(&a[0][0], &b[0][1], &x[1])
+	gf127.Mul(&a[1][0], &b[0][0], &x[2])
+	gf127.Mul(&a[1][0], &b[0][1], &x[3])
+
+	gf127.Mul(&a[0][1], &b[1][0], &c[0][0])
+	gf127.Add(&c[0][0], &x[0], &c[0][0])
+	gf127.Mul(&a[0][1], &b[1][1], &c[0][1])
+	gf127.Add(&c[0][1], &x[1], &c[0][1])
+	gf127.Mul(&a[1][1], &b[1][0], &c[1][0])
+	gf127.Add(&c[1][0], &x[2], &c[1][0])
+	gf127.Mul(&a[1][1], &b[1][1], &c[1][1])
+	gf127.Add(&c[1][1], &x[3], &c[1][1])
 	return c
 }
 
@@ -169,15 +140,15 @@ func Inv(a *sl2) (b *sl2) {
 }
 
 func inv(a, b *sl2, t *[2]GF127) {
-	avx.Mul(&a[0][0], &a[1][1], &t[0])
-	avx.Mul(&a[0][1], &a[1][0], &t[1])
-	avx.Add(&t[0], &t[1], &t[0])
+	gf127.Mul(&a[0][0], &a[1][1], &t[0])
+	gf127.Mul(&a[0][1], &a[1][0], &t[1])
+	gf127.Add(&t[0], &t[1], &t[0])
 	gf127.Inv(&t[0], &t[1])
 
-	avx.Mul(&t[1], &a[0][0], &b[1][1])
-	avx.Mul(&t[1], &a[0][1], &b[0][1])
-	avx.Mul(&t[1], &a[1][0], &b[1][0])
-	avx.Mul(&t[1], &a[1][1], &b[0][0])
+	gf127.Mul(&t[1], &a[0][0], &b[1][1])
+	gf127.Mul(&t[1], &a[0][1], &b[0][1])
+	gf127.Mul(&t[1], &a[1][0], &b[1][0])
+	gf127.Mul(&t[1], &a[1][1], &b[0][0])
 }
 
 func (c *sl2) String() string {
diff --git a/tz/sl2_test.go b/tz/sl2_test.go
index d43fec8..884d64c 100644
--- a/tz/sl2_test.go
+++ b/tz/sl2_test.go
@@ -6,7 +6,6 @@ import (
 	"time"
 
 	"github.com/nspcc-dev/tzhash/gf127"
-	"github.com/nspcc-dev/tzhash/gf127/avx"
 	"github.com/stretchr/testify/require"
 )
 
@@ -22,12 +21,12 @@ func random() (a *sl2) {
 
 	// so that result is in SL2
 	// d = a^-1*(1+b*c)
-	avx.Mul(&a[0][1], &a[1][0], &a[1][1])
-	avx.Add(&a[1][1], gf127.New(1, 0), &a[1][1])
+	gf127.Mul(&a[0][1], &a[1][0], &a[1][1])
+	gf127.Add(&a[1][1], gf127.New(1, 0), &a[1][1])
 
 	t := gf127.New(0, 0)
 	gf127.Inv(&a[0][0], t)
-	avx.Mul(t, &a[1][1], &a[1][1])
+	gf127.Mul(t, &a[1][1], &a[1][1])
 
 	return
 }