diff --git a/go.mod b/go.mod index 1216e51..0249af4 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,5 @@ module github.com/nspcc-dev/tzhash +go 1.12 + require github.com/stretchr/testify v1.3.0 diff --git a/tz/cpuid_x86.go b/internal/cpuid/cpuid_x86.go similarity index 92% rename from tz/cpuid_x86.go rename to internal/cpuid/cpuid_x86.go index 1d7630e..bf19137 100644 --- a/tz/cpuid_x86.go +++ b/internal/cpuid/cpuid_x86.go @@ -28,7 +28,7 @@ // +build 386 amd64 amd64p32 -package tz +package cpuid const ( bitOSXSAVE = 1 << 27 @@ -36,14 +36,19 @@ const ( bitAVX2 = 1 << 5 ) -func setFeatures() { +var ( + hasAVX bool + hasAVX2 bool +) + +func init() { maxID, _, _, _ := cpuid(0, 0) if maxID < 1 { return } _, _, ecx1, _ := cpuid(1, 0) - hasOSXSAVE = isSet(ecx1, bitOSXSAVE) + hasOSXSAVE := isSet(ecx1, bitOSXSAVE) osSupportsAVX := false if hasOSXSAVE { @@ -61,6 +66,9 @@ func setFeatures() { hasAVX2 = isSet(ebx7, bitAVX2) && osSupportsAVX } +func HasAVX() bool { return hasAVX } +func HasAVX2() bool { return hasAVX2 } + func isSet(hwc uint32, value uint32) bool { return hwc&value != 0 } diff --git a/tz/cpuid_x86.s b/internal/cpuid/cpuid_x86.s similarity index 100% rename from tz/cpuid_x86.s rename to internal/cpuid/cpuid_x86.s diff --git a/tz/avx2_inline_amd64.s b/tz/avx2_inline_amd64.s index d4b368c..fb7f83a 100644 --- a/tz/avx2_inline_amd64.s +++ b/tz/avx2_inline_amd64.s @@ -1,5 +1,28 @@ #include "textflag.h" +#define mask(bit, src, tmp, to1, to2) \ + MOVQ src, tmp \ + SHRQ bit, tmp \ + ANDQ $1, tmp \ + NEGQ tmp \ + MOVQ tmp, to1 \ + VPBROADCASTB to1, to2 + +#define mulBit(bit) \ + VPSLLQ $1, Y0, Y1 \ + VPALIGNR $8, Y1, Y0, Y2 \ + VPSRLQ $63, Y2, Y2 \ + VPXOR Y1, Y2, Y2 \ + VPSRLQ $63, Y1, Y3 \ + VPSLLQ $63, Y3, Y3 \ + VPUNPCKHQDQ Y3, Y3, Y3 \ + VPXOR Y2, Y3, Y3 \ + mask(bit, CX, DX, X1, Y2) \ + VPXOR Y3, Y8, Y3 \ + VPAND Y3, Y2, Y4 \ + VPXOR Y4, Y0, Y8 \ + VMOVDQA Y3, Y0 + // func mulByteRightx2(c00c10, c01c11 *[4]uint64, b byte) TEXT ·mulByteRightx2(SB),NOSPLIT,$0 MOVQ c00c10+0(FP), AX @@ -8,179 +31,16 @@ TEXT ·mulByteRightx2(SB),NOSPLIT,$0 VMOVDQA (BX), Y8 MOVB b+16(FP), CX - // 1 bit - VPSLLQ $1, Y0, Y1 - VPALIGNR $8, Y1, Y0, Y2 - VPSRLQ $63, Y2, Y2 - VPXOR Y1, Y2, Y2 - VPSRLQ $63, Y1, Y3 - VPSLLQ $63, Y3, Y3 - VPUNPCKHQDQ Y3, Y3, Y3 - VPXOR Y2, Y3, Y3 + mulBit($7) + mulBit($6) + mulBit($5) + mulBit($4) + mulBit($3) + mulBit($2) + mulBit($1) + mulBit($0) - MOVQ CX, DX - SHRQ $7, DX - ANDQ $1, DX - NEGQ DX - MOVQ DX, X1 - VPBROADCASTB X1, Y2 - - VPXOR Y3, Y8, Y3 - VPAND Y3, Y2, Y4 - VPXOR Y4, Y0, Y8 - VMOVDQA Y3, Y0 - - // 2 bit - VPSLLQ $1, Y0, Y1 - VPALIGNR $8, Y1, Y0, Y2 - VPSRLQ $63, Y2, Y2 - VPXOR Y1, Y2, Y2 - VPSRLQ $63, Y1, Y3 - VPSLLQ $63, Y3, Y3 - VPUNPCKHQDQ Y3, Y3, Y3 - VPXOR Y2, Y3, Y3 - - MOVQ CX, DX - SHRQ $6, DX - ANDQ $1, DX - NEGQ DX - MOVQ DX, X1 - VPBROADCASTB X1, Y2 - - VPXOR Y3, Y8, Y3 - VPAND Y3, Y2, Y4 - VPXOR Y4, Y0, Y8 - VMOVDQA Y3, Y0 - - // 3 bit - VPSLLQ $1, Y0, Y1 - VPALIGNR $8, Y1, Y0, Y2 - VPSRLQ $63, Y2, Y2 - VPXOR Y1, Y2, Y2 - VPSRLQ $63, Y1, Y3 - VPSLLQ $63, Y3, Y3 - VPUNPCKHQDQ Y3, Y3, Y3 - VPXOR Y2, Y3, Y3 - - MOVQ CX, DX - SHRQ $5, DX - ANDQ $1, DX - NEGQ DX - MOVQ DX, X1 - VPBROADCASTB X1, Y2 - - VPXOR Y3, Y8, Y3 - VPAND Y3, Y2, Y4 - VPXOR Y4, Y0, Y8 - VMOVDQA Y3, Y0 - - // 4 bit - VPSLLQ $1, Y0, Y1 - VPALIGNR $8, Y1, Y0, Y2 - VPSRLQ $63, Y2, Y2 - VPXOR Y1, Y2, Y2 - VPSRLQ $63, Y1, Y3 - VPSLLQ $63, Y3, Y3 - VPUNPCKHQDQ Y3, Y3, Y3 - VPXOR Y2, Y3, Y3 - - MOVQ CX, DX - SHRQ $4, DX - ANDQ $1, DX - NEGQ DX - MOVQ DX, X1 - VPBROADCASTB X1, Y2 - - VPXOR Y3, Y8, Y3 - VPAND Y3, Y2, Y4 - VPXOR Y4, Y0, Y8 - VMOVDQA Y3, Y0 - - // 5 bit - VPSLLQ $1, Y0, Y1 - VPALIGNR $8, Y1, Y0, Y2 - VPSRLQ $63, Y2, Y2 - VPXOR Y1, Y2, Y2 - VPSRLQ $63, Y1, Y3 - VPSLLQ $63, Y3, Y3 - VPUNPCKHQDQ Y3, Y3, Y3 - VPXOR Y2, Y3, Y3 - - MOVQ CX, DX - SHRQ $3, DX - ANDQ $1, DX - NEGQ DX - MOVQ DX, X1 - VPBROADCASTB X1, Y2 - - VPXOR Y3, Y8, Y3 - VPAND Y3, Y2, Y4 - VPXOR Y4, Y0, Y8 - VMOVDQA Y3, Y0 - - // 6 bit - VPSLLQ $1, Y0, Y1 - VPALIGNR $8, Y1, Y0, Y2 - VPSRLQ $63, Y2, Y2 - VPXOR Y1, Y2, Y2 - VPSRLQ $63, Y1, Y3 - VPSLLQ $63, Y3, Y3 - VPUNPCKHQDQ Y3, Y3, Y3 - VPXOR Y2, Y3, Y3 - - MOVQ CX, DX - SHRQ $2, DX - ANDQ $1, DX - NEGQ DX - MOVQ DX, X1 - VPBROADCASTB X1, Y2 - - VPXOR Y3, Y8, Y3 - VPAND Y3, Y2, Y4 - VPXOR Y4, Y0, Y8 - VMOVDQA Y3, Y0 - - // 7 bit - VPSLLQ $1, Y0, Y1 - VPALIGNR $8, Y1, Y0, Y2 - VPSRLQ $63, Y2, Y2 - VPXOR Y1, Y2, Y2 - VPSRLQ $63, Y1, Y3 - VPSLLQ $63, Y3, Y3 - VPUNPCKHQDQ Y3, Y3, Y3 - VPXOR Y2, Y3, Y3 - - MOVQ CX, DX - SHRQ $1, DX - ANDQ $1, DX - NEGQ DX - MOVQ DX, X1 - VPBROADCASTB X1, Y2 - - VPXOR Y3, Y8, Y3 - VPAND Y3, Y2, Y4 - VPXOR Y4, Y0, Y8 - VMOVDQA Y3, Y0 - - // 8 bit - VPSLLQ $1, Y0, Y1 - VPALIGNR $8, Y1, Y0, Y2 - VPSRLQ $63, Y2, Y2 - VPXOR Y1, Y2, Y2 - VPSRLQ $63, Y1, Y3 - VPSLLQ $63, Y3, Y3 - VPUNPCKHQDQ Y3, Y3, Y3 - VPXOR Y2, Y3, Y3 - - MOVQ CX, DX - ANDQ $1, DX - NEGQ DX - MOVQ DX, X1 - VPBROADCASTB X1, Y2 - - VPXOR Y3, Y8, Y3 - VPAND Y3, Y2, Y4 - VPXOR Y4, Y0, Y8 VMOVDQA Y8, (BX) - VMOVDQA Y3, (AX) + VMOVDQA Y0, (AX) + RET diff --git a/tz/hash.go b/tz/hash.go index b617d40..25eb2fa 100644 --- a/tz/hash.go +++ b/tz/hash.go @@ -6,6 +6,8 @@ package tz import ( "errors" "hash" + + "github.com/nspcc-dev/tzhash/internal/cpuid" ) type Implementation int @@ -22,9 +24,10 @@ const ( ) var ( - hasAVX bool - hasAVX2 bool - hasOSXSAVE bool + hasAVX = cpuid.HasAVX() + // Having AVX2 does not guarantee + // that AVX is also present. + hasAVX2 = cpuid.HasAVX2() && hasAVX ) func (impl Implementation) String() string { diff --git a/tz/sl2.go b/tz/sl2.go index 231de4e..30c0264 100644 --- a/tz/sl2.go +++ b/tz/sl2.go @@ -17,8 +17,6 @@ var id = sl2{ var mul func(a, b, c *sl2, x *[4]gf127.GF127) func init() { - setFeatures() - if hasAVX { mul = mulSL2AVX } else {